Files
Thesis/src/scripts/aggregate_features.ipynb
T
2026-05-06 22:31:52 +00:00

126 lines
5.5 KiB
Plaintext

{
"cells": [
{
"cell_type": "code",
"execution_count": 2,
"id": "0336fd0c",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"✅ База загружена. Треков: 1744\n",
"🔍 Собираем акустические признаки...\n",
"\n",
"🚀 ГОТОВО! Обогащенная база сохранена: ../../dataset/DEAM/music_db_enriched.csv\n",
"Собрано фичей для 1744 из 1744 треков.\n",
" song_id valence arousal energy flux centroid pitch \\\n",
"0 2 3.1 3.0 0.097268 0.846947 483.421751 93.884056 \n",
"1 3 3.5 3.3 0.126809 0.959460 173.219616 62.682589 \n",
"2 4 5.7 5.5 0.156699 1.333944 466.434797 92.850316 \n",
"3 5 4.4 5.3 0.126455 1.009927 546.152506 158.673853 \n",
"4 7 5.8 6.4 0.268180 1.589191 175.369162 83.823484 \n",
"\n",
" hnr zcr entropy sharpness \n",
"0 3.615380 0.034270 3.299075 0.426490 \n",
"1 -2.600122 0.017893 2.294971 0.165583 \n",
"2 -0.579130 0.042936 3.258138 0.395410 \n",
"3 1.751148 0.043781 3.514585 0.494367 \n",
"4 12.006770 0.014783 2.177862 0.170058 \n"
]
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from pathlib import Path\n",
"from tqdm import tqdm # для красивого прогресс-бара, если не установлен - убери\n",
"\n",
"# 1. File locations, relative to the notebook's working directory (adjust if run from elsewhere)\n",
"base_dir = Path(\"../../dataset/DEAM\")\n",
"music_db_path = base_dir / \"music_db.csv\"\n",
"features_dir = base_dir / \"features\" / \"features\"\n",
"output_path = base_dir / \"music_db_enriched.csv\"\n",
"\n",
"# 2. The eight acoustic features to keep:\n",
"#    column name in the per-track feature CSV -> short column name in the enriched table\n",
"target_columns = {\n",
"    'pcm_RMSenergy_sma_amean': 'energy',\n",
"    'pcm_fftMag_spectralFlux_sma_amean': 'flux',\n",
"    'pcm_fftMag_spectralCentroid_sma_amean': 'centroid',\n",
"    'F0final_sma_amean': 'pitch',\n",
"    'logHNR_sma_amean': 'hnr',\n",
"    'pcm_zcr_sma_amean': 'zcr',\n",
"    'pcm_fftMag_spectralEntropy_sma_amean': 'entropy',\n",
"    'pcm_fftMag_psySharpness_sma_amean': 'sharpness'\n",
"}\n",
"# Both lists follow the dict's insertion order, so position i of one matches position i of the other.\n",
"source_columns = list(target_columns.keys())\n",
"feature_names = list(target_columns.values())\n",
"\n",
"# 3. Load the base table that already holds the valence/arousal annotations\n",
"if not music_db_path.exists():\n",
"    print(f\"❌ ОШИБКА: Не найден файл {music_db_path}\")\n",
"else:\n",
"    df_main = pd.read_csv(music_db_path)\n",
"    # Remember the size BEFORE any rows are dropped so the final summary can report \"found of total\".\n",
"    total_tracks = len(df_main)\n",
"    print(f\"✅ База загружена. Треков: {total_tracks}\")\n",
"\n",
"    # Pre-create the feature columns as NaN; tracks that never get values stay NaN and are dropped in step 5.\n",
"    for col_name in feature_names:\n",
"        df_main[col_name] = np.nan\n",
"\n",
"    # 4. For every track, look up its per-track feature CSV and aggregate it\n",
"    print(\"🔍 Собираем акустические признаки...\")\n",
"\n",
"    # Iterate over the single column we need instead of materialising a Series per row with iterrows().\n",
"    for index, raw_song_id in df_main['song_id'].items():\n",
"        song_id = int(raw_song_id)\n",
"        feature_file = features_dir / f\"{song_id}.csv\"\n",
"\n",
"        if not feature_file.exists():\n",
"            # No feature file for this track: leave its NaNs in place.\n",
"            continue\n",
"\n",
"        try:\n",
"            # The feature CSVs are semicolon-separated; parse only the columns we actually use.\n",
"            df_feat = pd.read_csv(feature_file, sep=';', usecols=source_columns)\n",
"\n",
"            # Each file holds many rows (frames) per song; collapse them to one mean per feature.\n",
"            mean_features = df_feat[source_columns].mean()\n",
"\n",
"            # Write the whole feature row at once; the value order matches feature_names.\n",
"            df_main.loc[index, feature_names] = mean_features.to_numpy()\n",
"        except Exception as e:\n",
"            # Deliberate best-effort: report the unreadable file and keep processing the others.\n",
"            print(f\"Ошибка чтения {feature_file}: {e}\")\n",
"\n",
"    # 5. Save the result\n",
"    # Drop tracks that ended up without a complete set of features (missing, unreadable or empty files).\n",
"    df_main = df_main.dropna(subset=feature_names)\n",
"    # Rows that survived the drop are exactly the tracks with usable features.\n",
"    found_count = len(df_main)\n",
"\n",
"    df_main.to_csv(output_path, index=False)\n",
"    print(f\"\\n🚀 ГОТОВО! Обогащенная база сохранена: {output_path}\")\n",
"    print(f\"Собрано фичей для {found_count} из {total_tracks} треков.\")\n",
"    print(df_main.head())"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python (thesis)",
"language": "python",
"name": "thesis"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}