{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "deam-enrich-intro",
   "metadata": {},
   "source": [
    "# Enrich the DEAM music database with acoustic features\n",
    "\n",
    "Reads `music_db.csv`, averages eight selected columns of each track's feature CSV over all of its rows, and writes the result to `music_db_enriched.csv`. Tracks for which no complete set of features could be collected are dropped from the saved file."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "0336fd0c",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "✅ База загружена. Треков: 1744\n",
      "🔍 Собираем акустические признаки...\n",
      "\n",
      "🚀 ГОТОВО! Обогащенная база сохранена: ../../dataset/DEAM/music_db_enriched.csv\n",
      "Собрано фичей для 1744 из 1744 треков.\n",
      " song_id valence arousal energy flux centroid pitch \\\n",
      "0 2 3.1 3.0 0.097268 0.846947 483.421751 93.884056 \n",
      "1 3 3.5 3.3 0.126809 0.959460 173.219616 62.682589 \n",
      "2 4 5.7 5.5 0.156699 1.333944 466.434797 92.850316 \n",
      "3 5 4.4 5.3 0.126455 1.009927 546.152506 158.673853 \n",
      "4 7 5.8 6.4 0.268180 1.589191 175.369162 83.823484 \n",
      "\n",
      " hnr zcr entropy sharpness \n",
      "0 3.615380 0.034270 3.299075 0.426490 \n",
      "1 -2.600122 0.017893 2.294971 0.165583 \n",
      "2 -0.579130 0.042936 3.258138 0.395410 \n",
      "3 1.751148 0.043781 3.514585 0.494367 \n",
      "4 12.006770 0.014783 2.177862 0.170058 \n"
     ]
    }
   ],
   "source": [
    "from pathlib import Path\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "try:\n",
    "    from tqdm import tqdm  # optional dependency, used only for the progress bar\n",
    "except ImportError:\n",
    "    def tqdm(iterable, **_kwargs):\n",
    "        \"\"\"Stand-in used when tqdm is not installed: iterate without a progress bar.\"\"\"\n",
    "        return iterable\n",
    "\n",
    "# 1. File locations\n",
    "base_dir = Path(\"../../dataset/DEAM\")  # adjust when running from a different working directory\n",
    "music_db_path = base_dir / \"music_db.csv\"\n",
    "features_dir = base_dir / \"features\" / \"features\"\n",
    "output_path = base_dir / \"music_db_enriched.csv\"\n",
    "\n",
    "# 2. The eight selected features: column name in the feature CSV -> short name in the database\n",
    "target_columns = {\n",
    "    'pcm_RMSenergy_sma_amean': 'energy',\n",
    "    'pcm_fftMag_spectralFlux_sma_amean': 'flux',\n",
    "    'pcm_fftMag_spectralCentroid_sma_amean': 'centroid',\n",
    "    'F0final_sma_amean': 'pitch',\n",
    "    'logHNR_sma_amean': 'hnr',\n",
    "    'pcm_zcr_sma_amean': 'zcr',\n",
    "    'pcm_fftMag_spectralEntropy_sma_amean': 'entropy',\n",
    "    'pcm_fftMag_psySharpness_sma_amean': 'sharpness'\n",
    "}\n",
    "\n",
    "\n",
    "def load_mean_features(feature_file, columns):\n",
    "    \"\"\"Average the selected columns of one feature CSV over all of its rows.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    feature_file : path-like\n",
    "        Semicolon-separated CSV holding the features of a single track.\n",
    "    columns : dict\n",
    "        Maps source column names to the names used in the result.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        ``{new_name: mean_value}`` for every entry of ``columns``.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    OSError, ValueError, TypeError\n",
    "        If the file cannot be read or parsed, lacks a requested column,\n",
    "        or a requested column cannot be averaged.\n",
    "    \"\"\"\n",
    "    source_names = list(columns)\n",
    "    # usecols skips parsing of every column that is not needed.\n",
    "    frames = pd.read_csv(feature_file, sep=';', usecols=source_names)\n",
    "    return frames[source_names].mean().rename(columns).to_dict()\n",
    "\n",
    "\n",
    "def enrich_music_db(db_path, feat_dir, out_path, columns):\n",
    "    \"\"\"Attach per-track mean features to the music database and save the result.\n",
    "\n",
    "    For every ``song_id`` in the database the file ``<feat_dir>/<song_id>.csv`` is\n",
    "    looked up. Missing or unreadable files are skipped (unreadable ones are\n",
    "    reported), and rows left without a complete feature set are dropped before\n",
    "    the table is written to ``out_path``.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    db_path : path-like\n",
    "        CSV with at least a ``song_id`` column.\n",
    "    feat_dir : path-like\n",
    "        Directory holding one feature CSV per track.\n",
    "    out_path : path-like\n",
    "        Destination of the enriched CSV.\n",
    "    columns : dict\n",
    "        Maps feature-CSV column names to the column names added to the database.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame or None\n",
    "        The enriched table that was saved, or ``None`` when ``db_path`` does not exist.\n",
    "    \"\"\"\n",
    "    db_path = Path(db_path)\n",
    "    feat_dir = Path(feat_dir)\n",
    "\n",
    "    # 3. Load the current database\n",
    "    if not db_path.exists():\n",
    "        print(f\"❌ ОШИБКА: Не найден файл {db_path}\")\n",
    "        return None\n",
    "\n",
    "    music_db = pd.read_csv(db_path)\n",
    "    # Remember the size before any row is dropped so the summary can report found vs. total.\n",
    "    total_tracks = len(music_db)\n",
    "    print(f\"✅ База загружена. Треков: {total_tracks}\")\n",
    "\n",
    "    # 4. Collect one feature record per track, aligned with the database rows\n",
    "    print(\"🔍 Собираем акустические признаки...\")\n",
    "    feature_names = list(columns.values())\n",
    "    missing_record = dict.fromkeys(feature_names, np.nan)\n",
    "    records = []\n",
    "    found_count = 0\n",
    "\n",
    "    for song_id in tqdm(music_db['song_id'], desc=\"features\"):\n",
    "        feature_file = feat_dir / f\"{int(song_id)}.csv\"\n",
    "        record = missing_record\n",
    "        if feature_file.exists():\n",
    "            try:\n",
    "                record = load_mean_features(feature_file, columns)\n",
    "                found_count += 1\n",
    "            except (OSError, KeyError, ValueError, TypeError) as e:\n",
    "                # Best effort: report the broken file and keep going.\n",
    "                print(f\"Ошибка чтения {feature_file}: {e}\")\n",
    "        records.append(record)\n",
    "\n",
    "    feature_frame = pd.DataFrame(records, index=music_db.index, columns=feature_names)\n",
    "    enriched = music_db.assign(**{name: feature_frame[name] for name in feature_names})\n",
    "\n",
    "    # 5. Drop tracks without a complete feature set and save\n",
    "    enriched = enriched.dropna(subset=feature_names)\n",
    "    enriched.to_csv(out_path, index=False)\n",
    "    print(f\"\\n🚀 ГОТОВО! Обогащенная база сохранена: {out_path}\")\n",
    "    print(f\"Собрано фичей для {found_count} из {total_tracks} треков.\")\n",
    "    return enriched\n",
    "\n",
    "\n",
    "df_main = enrich_music_db(music_db_path, features_dir, output_path, target_columns)\n",
    "if df_main is not None:\n",
    "    print(df_main.head())"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python (thesis)",
   "language": "python",
   "name": "thesis"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}