ref: refactor before checkout

2026-06-02 17:27:05 +00:00
parent 9ce92b70a9
commit f04cd7359b
27 changed files with 1103 additions and 4266 deletions
@@ -1,49 +1,62 @@
+import numpy as np
+from pathlib import Path
+from PIL import Image
+
 import torch
 import torchvision.transforms as T
-from PIL import Image
 import timm
-from pathlib import Path
-import numpy as np
 from transformers import Blip2Processor, Blip2ForConditionalGeneration

 class ImageProcessor:
-    def __init__(self, model_path: Path | str):
+    def __init__(self, weights_path: str | Path):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        
-        self.emo_model = timm.create_model('resnet50', pretrained=False, num_classes=8)
-        if Path(model_path).exists():
-            self.emo_model.load_state_dict(torch.load(model_path, map_location=self.device))
+        # Модель извлечения визуальных признаков
+        self.feature_extractor = timm.create_model('resnet50', pretrained=False, num_classes=8)
        
-        self.emo_model.fc = torch.nn.Identity()
-        self.emo_model.to(self.device).eval()
+        if Path(weights_path).exists():
+            self.feature_extractor.load_state_dict(torch.load(weights_path, map_location=self.device))
+        else:
+            print(f"Не удалось найти веса ResNet по пути: {weights_path}")
+            
+        # Удаление слоя классификации для вывода сырого вектора эмбеддингов
+        self.feature_extractor.fc = torch.nn.Identity()
+        self.feature_extractor.to(self.device).eval()

-        self.emo_transform = T.Compose([
+        # Трансформации для предварительной обработки изображений
+        self.preprocess_image = T.Compose([
            T.Resize((224, 224)),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

-        print("Загрузка BLIP-2...")
+        # Модуль семантического описания сцены
+        print("Инициализация BLIP-2...")
        self.blip_processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
        self.blip_model = Blip2ForConditionalGeneration.from_pretrained(
            "Salesforce/blip2-opt-2.7b", 
            torch_dtype=torch.float16 
        ).to(self.device)
-        print("BLIP-2 и ResNet-50 готовы.")

    @torch.no_grad()
    def extract_embedding(self, image: Image.Image) -> np.ndarray:
-        img_rgb = image.convert('RGB')
-        img_tensor = self.emo_transform(img_rgb).unsqueeze(0).to(self.device)
-        return self.emo_model(img_tensor).cpu().numpy().flatten()
+        # Извлечение эмбеддингов из изображения
+        rgb_image = image.convert('RGB')
+        img_tensor = self.preprocess_image(rgb_image).unsqueeze(0).to(self.device)
+        
+        features = self.feature_extractor(img_tensor)
+        features_np = features.cpu().numpy()
+        
+        return features_np.flatten()
        
    @torch.no_grad()
    def describe_scene(self, image: Image.Image) -> str:
-        """Генерирует описание через BLIP-2."""
-        img_rgb = image.convert('RGB')
-        
-        inputs = self.blip_processor(images=img_rgb, return_tensors="pt").to(self.device, torch.float16)
+        # Генерация текстового описания сцены
+        rgb_image = image.convert('RGB')
        
+        inputs = self.blip_processor(images=rgb_image, return_tensors="pt").to(self.device, torch.float16)
        generated_ids = self.blip_model.generate(**inputs, max_new_tokens=40)
-        caption = self.blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
-        return caption
+        
+        scene_description = self.blip_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        
+        return scene_description.strip()
@@ -1,31 +1,31 @@
-import requests
-import json
 import re
+import json
+import requests

 class LLMAcousticBridge:
-    def __init__(self, model_name="dolphin-llama3:8b"):
-        self.model_name = model_name
+    def __init__(self, target_model="dolphin-llama3:8b"):
        self.api_url = "http://localhost:11434/api/generate"
+        self.model = target_model

-    def _clean_json(self, text):
-        """Вытаскивает чистый JSON из ответа нейросети."""
+    def _extract_json(self, raw_text: str):
+        # Проверка на ИИдиота, LLM иногда игнорирует format="json" и оборачивает ответ в маркдаун
        try:
-            match = re.search(r'\{.*\}', text, re.DOTALL)
+            match = re.search(r'\{.*\}', raw_text, re.DOTALL)
            if match:
                return json.loads(match.group(0))
-            return json.loads(text)
-        except:
+            return json.loads(raw_text)
+        except json.JSONDecodeError:
+            # Если ИИдиот
            return None

-    def get_acoustic_profile(self, valence, arousal, scene_descriptions):
-        """Просит LLM сгенерировать идеальный звук под описание."""
-        # Объединяем описания, если загружено несколько фото
-        context_str = " | ".join(scene_descriptions) if scene_descriptions else "abstract scene"
+    def get_acoustic_profile(self, v_score: float, a_score: float, scene_context: list) -> dict | None:
+        # Агрегация контекста для обработки серии снимков (события)
+        context_merged = " | ".join(scene_context) if scene_context else "abstract scene"
        
-        prompt = f"""You are an expert music producer and acoustic engineer. 
+        system_prompt = f"""You are an expert music producer and acoustic engineer. 
 Analyze the visual context and emotions to determine the ideal background music properties.
-Emotions: Valence {valence:.1f}/9.0 (Positivity), Arousal {arousal:.1f}/9.0 (Energy).
-Visual Context: {context_str}.
+Emotions: Valence {v_score:.1f}/9.0 (Positivity), Arousal {a_score:.1f}/9.0 (Energy).
+Visual Context: {context_merged}.

 Map this scene to exactly 6 acoustic features. Values MUST be floats between 0.0 and 1.0.
 1. "energy": (Loudness/Density. High for massive/busy scenes, Low for calm)
@@ -39,22 +39,27 @@ Return ONLY a valid JSON object. Do not add any text or explanation.
 Example: {{"energy": 0.5, "flux": 0.2, "centroid": 0.4, "pitch": 0.3, "hnr": 0.8, "zcr": 0.1}}"""

        try:
+            # Отправка промпта локальной Ollama
            response = requests.post(self.api_url, json={
-                "model": self.model_name,
-                "prompt": prompt,
+                "model": self.model,
+                "prompt": system_prompt,
                "stream": False,
                "format": "json"
-            }, timeout=30)
+            }, timeout=45)
            response.raise_for_status()
            
-            result_text = response.json().get("response", "")
-            profile = self._clean_json(result_text)
+            raw_response = response.json().get("response", "")
+            profile_data = self._extract_json(raw_response)
            
-            # Проверяем, что все нужные ключи есть
-            required_keys = ['energy', 'flux', 'centroid', 'pitch', 'hnr', 'zcr']
-            if profile and all(k in profile for k in required_keys):
-                return profile
+            # Валидация структуры ответа
+            expected_features = {'energy', 'flux', 'centroid', 'pitch', 'hnr', 'zcr'}
+            
+            if profile_data and expected_features.issubset(profile_data.keys()):
+                return profile_data
+                
+            print("LLM вернула неполный или некорректный набор акустических признаков")
            return None
-        except Exception as e:
-            print(f"Ошибка связи с локальной LLM: {e}")
+            
+        except requests.exceptions.RequestException as req_err:
+            print(f"Не удалось подключиться к Ollama: {req_err}")
            return None
@@ -1,67 +1,83 @@
+import joblib
 import numpy as np
 import pandas as pd
 from pathlib import Path
-import joblib

 class MusicMatcher:
    def __init__(self, db_path: Path | str, model_path: Path | str):
-        # Загружаем твою новую, обогащенную базу
+        # Загрузка базы данных музыкальных произведений
        self.music_db = pd.read_csv(db_path)
        self.acoustic_features = ['energy', 'flux', 'centroid', 'pitch', 'hnr', 'zcr']
        
-        # Удаляем строки, где нет акустических фич
-        self.music_db = self.music_db.dropna(subset=['valence', 'arousal'] + self.acoustic_features)
+        # Удаление записей с пропущенными целевыми или акустическими признаками
+        target_columns = ['valence', 'arousal'] + self.acoustic_features
+        self.music_db = self.music_db.dropna(subset=target_columns)
        
-        # Нормализуем акустику от 0 до 1, чтобы сравнивать с ответом LLM
+        # Масштабирование акустических параметров к диапазону [0, 1]
        self.norm_db = self.music_db.copy()
        for feat in self.acoustic_features:
-            f_min, f_max = self.norm_db[feat].min(), self.norm_db[feat].max()
+            f_min = self.norm_db[feat].min()
+            f_max = self.norm_db[feat].max()
            if f_max > f_min:
                self.norm_db[f"norm_{feat}"] = (self.norm_db[feat] - f_min) / (f_max - f_min)
            else:
                self.norm_db[f"norm_{feat}"] = 0.0

+        # Определение путей к аудиофайлам и загрузка модели регрессии
        self.audio_dir = Path(db_path).parent / "DEAM_audio" / "MEMD_audio"
-        self.regressor = joblib.load(model_path) if Path(model_path).exists() else None
+        
+        if Path(model_path).exists():
+            self.regressor = joblib.load(model_path)
+        else:
+            self.regressor = None

-    def predict_va(self, embedding: np.ndarray):
-        if self.regressor:
-            prediction = self.regressor.predict(embedding.reshape(1, -1))[0]
-            return np.clip(prediction[0], 1.0, 9.0), np.clip(prediction[1], 1.0, 9.0)
-        return 5.0, 5.0
+    def predict_va(self, embedding: np.ndarray) -> tuple[float, float]:
+        # Прогнозирование координат Valence/Arousal по визуальному эмбеддингу
+        if not self.regressor:
+            return 5.0, 5.0
+            
+        raw_prediction = self.regressor.predict(embedding.reshape(1, -1))[0]
+        valence_pred = np.clip(raw_prediction[0], 1.0, 9.0)
+        arousal_pred = np.clip(raw_prediction[1], 1.0, 9.0)
+        
+        return float(valence_pred), float(arousal_pred)

-    def get_audio_path(self, song_id):
-        if not self.audio_dir.exists(): return None
+    def get_audio_path(self, song_id: int | float | str) -> Path | None:
+        # Поиск физического пути к аудиофайлу в зависимости от расширения
+        if not self.audio_dir.exists():
+            return None
+            
        clean_id = str(int(float(song_id)))
        for ext in ['.mp3', '.wav']:
            path = self.audio_dir / f"{clean_id}{ext}"
-            if path.exists(): return path
+            if path.exists():
+                return path
        return None

-    def find_nearest_tracks(self, target_v: float, target_a: float, llm_profile: dict = None, top_k: int = 5):
-        # 1. Эмоциональная дистанция (как и раньше)
-        emo_dist = np.sqrt(
-            1.0 * (self.norm_db['valence'] - target_v)**2 + 
-            2.5 * (self.norm_db['arousal'] - target_a)**2
-        )
-        self.norm_db['emo_distance'] = emo_dist
+    def find_nearest_tracks(self, target_v: float, target_a: float, llm_profile: dict = None, top_k: int = 5) -> pd.DataFrame:
+        # Расчет евклидова расстояния в эмоциональном пространстве Рассела
+        v_dist = (self.norm_db['valence'] - target_v) ** 2
+        a_dist = (self.norm_db['arousal'] - target_a) ** 2
        
-        # Если LLM не дала ответ, сортируем только по эмоциям
+        # Взвешенное расстояние с приоритетом оси активации (Arousal)
+        self.norm_db['emo_distance'] = np.sqrt(1.0 * v_dist + 2.5 * a_dist)
+        
+        # Ранжирование только по эмоциональному критерию при отсутствии профиля LLM
        if not llm_profile:
            self.norm_db['final_score'] = self.norm_db['emo_distance']
            return self.norm_db.sort_values(by='final_score').head(top_k)

-        # 2. Акустическая дистанция (сравниваем треки с запросом LLM)
+        # Расчет отклонений по вектору акустических параметров LLM
        acoustic_penalty = np.zeros(len(self.norm_db))
        for feat in self.acoustic_features:
            if feat in llm_profile:
                target_val = llm_profile[feat]
                acoustic_penalty += np.abs(self.norm_db[f"norm_{feat}"] - target_val)

-        # Усредняем штраф
+        # Нормирование акустической дистанции
        self.norm_db['acoustic_distance'] = acoustic_penalty / len(self.acoustic_features)

-        # 3. Финальный Score (Смесь Эмоций и Акустики). Коэф 4.0 делает акустику важной!
+        # Вычисление интегральной метрики соответствия (мультимодальный скоринг)
        self.norm_db['final_score'] = self.norm_db['emo_distance'] + (self.norm_db['acoustic_distance'] * 4.0)

        return self.norm_db.sort_values(by='final_score').head(top_k)