Implement Phase 3 Music Analysis and LLM Engine

- Refactor Database: Add `Artist` model, M2M relationship, and `AnalysisSnapshot` model.
- Backend Services: Implement `StatsService` for computable metrics and `NarrativeService` for Gemini LLM integration.
- Fix Ingestion: Correctly handle multiple artists per track and backfill existing data.
- Testing: Add unit tests for statistics logic and live verification scripts.
- Documentation: Add `PHASE_4_FRONTEND_GUIDE.md`.
2026-02-25 11:46:07 +00:00 · 2025-12-24 23:16:32 +00:00
parent ab47dd62ca
commit f4432154b6
9 changed files with 942 additions and 30 deletions
--- a/backend/app/ingest.py
+++ b/backend/app/ingest.py
@@ -2,7 +2,7 @@ import asyncio
 import os
 from datetime import datetime
 from sqlalchemy.orm import Session
-from .models import Track, PlayHistory
+from .models import Track, PlayHistory, Artist
 from .database import SessionLocal
 from .services.spotify_client import SpotifyClient
 from .services.reccobeats_client import ReccoBeatsClient
@@ -19,9 +19,32 @@ def get_spotify_client():
 def get_reccobeats_client():
    return ReccoBeatsClient()

+async def ensure_artists_exist(db: Session, artists_data: list):
+    """
+    Return Artist rows for every entry in artists_data, creating missing ones.
+
+    Each entry must be a dict with "id" and "name" keys. New artists are added
+    to the session but not committed; the caller owns the commit.
+    """
+    artist_objects = []
+    for a_data in artists_data:
+        artist_id = a_data["id"]
+        artist = db.query(Artist).filter(Artist.id == artist_id).first()
+        if not artist:
+            # Leave genres unset (NULL): enrich_tracks selects artists with
+            # `genres == None`, so an empty list would never be enriched.
+            artist = Artist(
+                id=artist_id,
+                name=a_data["name"],
+            )
+            db.add(artist)
+        artist_objects.append(artist)
+    return artist_objects
+
 async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client: ReccoBeatsClient):
    """
    Finds tracks missing genres (Spotify) or audio features (ReccoBeats) and enriches them.
+    Also enriches Artists with genres.
    """

    # 1. Enrich Audio Features (via ReccoBeats)
@@ -66,39 +89,35 @@ async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client
        print(f"Updated {updated_count} tracks with audio features.")
        db.commit()

-    # 2. Enrich Genres (via Spotify Artists)
-    tracks_missing_genres = db.query(Track).filter(Track.genres == None).limit(50).all()
+    # 2. Enrich Artist Genres (via Spotify Artists)
+    # Select artists whose genres have never been fetched. The column is
+    # genres = Column(JSON, nullable=True), so NULL means "not fetched yet",
+    # while an empty list means "fetched, but Spotify returned no genres".
+    # An artist may genuinely have no genres; a dedicated "genres_checked"
+    # flag may be needed in the future, but for now NULL vs. [] is enough.

-    if tracks_missing_genres:
-        print(f"Enriching {len(tracks_missing_genres)} tracks with genres (Spotify)...")
+    artists_missing_genres = db.query(Artist).filter(Artist.genres == None).limit(50).all()

-        artist_ids = set()
-        track_artist_map = {}
-
-        for t in tracks_missing_genres:
-            if t.raw_data and "artists" in t.raw_data:
-                a_ids = [a["id"] for a in t.raw_data["artists"]]
-                artist_ids.update(a_ids)
-                track_artist_map[t.id] = a_ids
-
-        artist_ids_list = list(artist_ids)
-        artist_genre_map = {}
+    if artists_missing_genres:
+        print(f"Enriching {len(artists_missing_genres)} artists with genres (Spotify)...")
+        artist_ids_list = [a.id for a in artists_missing_genres]

+        artist_data_map = {}
+        # Spotify allows fetching 50 artists at a time
        for i in range(0, len(artist_ids_list), 50):
            chunk = artist_ids_list[i:i+50]
            artists_data = await spotify_client.get_artists(chunk)
            for a_data in artists_data:
                if a_data:
-                    artist_genre_map[a_data["id"]] = a_data.get("genres", [])
+                    artist_data_map[a_data["id"]] = a_data.get("genres", [])

-        for t in tracks_missing_genres:
-            a_ids = track_artist_map.get(t.id, [])
-            combined_genres = set()
-            for a_id in a_ids:
-                genres = artist_genre_map.get(a_id, [])
-                combined_genres.update(genres)
-
-            t.genres = list(combined_genres)
+        for artist in artists_missing_genres:
+            genres = artist_data_map.get(artist.id)
+            if genres is not None:
+                artist.genres = genres
+            else:
+                # If we couldn't fetch, set to empty list so we don't keep retrying forever (or handle errors better)
+                artist.genres = []

        db.commit()

@@ -128,15 +147,30 @@ async def ingest_recently_played(db: Session):
            track = Track(
                id=track_id,
                name=track_data["name"],
-                artist=", ".join([a["name"] for a in track_data["artists"]]),
+                artist=", ".join([a["name"] for a in track_data["artists"]]), # Legacy string
                album=track_data["album"]["name"],
                duration_ms=track_data["duration_ms"],
                popularity=track_data["popularity"],
                raw_data=track_data
            )
+
+            # Handle Artists Relation
+            artists_data = track_data.get("artists", [])
+            artist_objects = await ensure_artists_exist(db, artists_data)
+            track.artists = artist_objects
+
            db.add(track)
            db.commit()

+        # Ensure relationships exist even if track existed (e.g. migration)
+        # Check if track has artists linked. If not (and raw_data has them), link them.
+        # FIX: Logic was previously indented improperly inside `if not track`.
+        if not track.artists and track.raw_data and "artists" in track.raw_data:
+             print(f"Backfilling artists for track {track.name}")
+             artist_objects = await ensure_artists_exist(db, track.raw_data["artists"])
+             track.artists = artist_objects
+             db.commit()
+
        exists = db.query(PlayHistory).filter(
            PlayHistory.track_id == track_id,
            PlayHistory.played_at == played_at