mirror of
https://github.com/bnair123/MusicAnalyser.git
synced 2026-02-25 11:46:07 +00:00
Implement Phase 3 Music Analysis and LLM Engine
- Refactor Database: Add `Artist` model, M2M relationship, and `AnalysisSnapshot` model. - Backend Services: Implement `StatsService` for computable metrics and `NarrativeService` for Gemini LLM integration. - Fix Ingestion: Correctly handle multiple artists per track and backfill existing data. - Testing: Add unit tests for statistics logic and live verification scripts. - Documentation: Add `PHASE_4_FRONTEND_GUIDE.md`.
This commit is contained in:
396
backend/app/services/stats_service.py
Normal file
396
backend/app/services/stats_service.py
Normal file
@@ -0,0 +1,396 @@
from collections import Counter
from datetime import datetime, timedelta
from typing import Any, Dict, List
import math

import numpy as np
from sqlalchemy import func, distinct, desc
from sqlalchemy.orm import Session

from ..models import PlayHistory, Track, Artist, AnalysisSnapshot
||||
class StatsService:
    """Computes listening statistics over a window of Spotify play history.

    Each public ``compute_*`` method takes an inclusive
    ``[period_start, period_end]`` window and returns a plain dict suitable
    for JSON serialization. ``generate_full_report`` bundles them all.

    NOTE(review): all timestamps are compared as stored (presumably UTC) —
    confirm timezone handling against the ingestion layer.
    """

    # A gap of more than this many minutes between consecutive plays
    # starts a new listening session.
    SESSION_GAP_MINUTES = 20

    def __init__(self, db: Session):
        # SQLAlchemy session used for all queries; owned by the caller.
        self.db = db

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    def _plays_in_period(
        self,
        period_start: datetime,
        period_end: datetime,
        ordered: bool = False,
    ) -> List[PlayHistory]:
        """Fetch all plays in [period_start, period_end] (inclusive).

        Args:
            period_start: Window start (inclusive).
            period_end: Window end (inclusive).
            ordered: When True, results are sorted by played_at ascending
                (needed by session and skip analysis).
        """
        query = self.db.query(PlayHistory).filter(
            PlayHistory.played_at >= period_start,
            PlayHistory.played_at <= period_end,
        )
        if ordered:
            query = query.order_by(PlayHistory.played_at.asc())
        return query.all()

    def _track_map(self, track_ids) -> Dict[Any, Track]:
        """Bulk-load Track rows for the given ids into an id -> Track map.

        Avoids per-play lazy loads; returns an empty dict for empty input
        (``IN ()`` queries are wasteful and dialect-dependent).
        """
        if not track_ids:
            return {}
        tracks = self.db.query(Track).filter(Track.id.in_(set(track_ids))).all()
        return {t.id: t for t in tracks}

    # ------------------------------------------------------------------
    # Public metrics
    # ------------------------------------------------------------------

    def compute_volume_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Calculates volume metrics: total plays, unique tracks/artists/albums,
        top-5 lists, repeat rate and play concentration (HHI).
        """
        plays = self._plays_in_period(period_start, period_end)
        total_plays = len(plays)

        if total_plays == 0:
            # Keep the empty response the same shape as the populated one.
            # FIX: the original omitted "top_genres" here.
            return {
                "total_plays": 0,
                "estimated_minutes": 0,
                "unique_tracks": 0,
                "unique_artists": 0,
                "unique_albums": 0,
                "unique_genres": 0,
                "top_tracks": [],
                "top_artists": [],
                "top_genres": [],
                "repeat_rate": 0,
                "concentration": {},
            }

        # Bulk map of track_id -> Track so the play loop does no per-row queries.
        track_map = self._track_map([p.track_id for p in plays])

        total_ms = 0
        unique_track_ids = set()
        unique_artist_ids = set()
        # Album id from raw_data when available; falls back to the album
        # name string, so this set may mix ids and names (best-effort count).
        unique_album_ids = set()

        genre_counts: Counter = Counter()
        track_play_counts: Counter = Counter()
        artist_play_counts: Counter = Counter()

        for play in plays:
            track = track_map.get(play.track_id)
            if not track:
                continue

            total_ms += track.duration_ms
            unique_track_ids.add(track.id)
            track_play_counts[track.id] += 1

            # NOTE(review): track.artists is a lazy relationship; for very
            # large periods this can trigger many queries — consider eager
            # loading (joinedload) if profiling shows it matters.
            for artist in track.artists:
                unique_artist_ids.add(artist.id)
                artist_play_counts[artist.id] += 1
                if artist.genres:
                    genre_counts.update(artist.genres)

            if track.raw_data and "album" in track.raw_data:
                unique_album_ids.add(track.raw_data["album"]["id"])
            else:
                unique_album_ids.add(track.album)  # Fallback

        estimated_minutes = total_ms / 60000

        # Top 5 tracks by play count in the period.
        top_tracks = []
        for tid, count in track_play_counts.most_common(5):
            track = track_map[tid]
            top_tracks.append({
                "name": track.name,
                "artist": track.artist,  # Display string
                "count": count,
            })

        # Top 5 artists: resolve names in one bulk query.
        top_artist_pairs = artist_play_counts.most_common(5)
        artist_rows = self.db.query(Artist).filter(
            Artist.id.in_([aid for aid, _ in top_artist_pairs])
        ).all()
        artist_name_map = {a.id: a.name for a in artist_rows}
        top_artists = [
            {"name": artist_name_map.get(aid, "Unknown"), "count": count}
            for aid, count in top_artist_pairs
        ]

        top_genres = [
            {"name": genre, "count": count}
            for genre, count in genre_counts.most_common(5)
        ]

        unique_tracks_count = len(unique_track_ids)
        # Share of plays that were re-listens; total_plays > 0 is guaranteed here.
        repeat_rate = (total_plays - unique_tracks_count) / total_plays

        # HHI (Herfindahl–Hirschman Index): sum of squared play shares.
        # 1.0 = all plays of one track, -> 0 = evenly spread listening.
        hhi = sum((c / total_plays) ** 2 for c in track_play_counts.values())

        return {
            "total_plays": total_plays,
            "estimated_minutes": int(estimated_minutes),
            "unique_tracks": unique_tracks_count,
            "unique_artists": len(unique_artist_ids),
            "unique_albums": len(unique_album_ids),
            "unique_genres": len(genre_counts),
            "top_tracks": top_tracks,
            "top_artists": top_artists,
            "top_genres": top_genres,
            "repeat_rate": round(repeat_rate, 3),
            "concentration": {
                "hhi": round(hhi, 4),
                # "gini": ... (skip for now to keep it simple)
            },
        }

    def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Hourly and weekday play distribution, peak hour and weekend share.

        NOTE(review): played_at is used as stored (presumably UTC) — confirm
        whether the frontend expects the user's local timezone.
        """
        plays = self._plays_in_period(period_start, period_end)

        hourly_counts = [0] * 24
        weekday_counts = [0] * 7  # 0=Mon, 6=Sun

        if not plays:
            # FIX: return the full response shape; the original returned only
            # "hourly_distribution" for an empty period.
            return {
                "hourly_distribution": hourly_counts,
                "peak_hour": None,
                "weekday_distribution": weekday_counts,
                "weekend_share": 0,
            }

        for play in plays:
            hourly_counts[play.played_at.hour] += 1
            weekday_counts[play.played_at.weekday()] += 1

        peak_hour = hourly_counts.index(max(hourly_counts))

        # Weekend = Saturday (5) + Sunday (6); plays is non-empty here.
        weekend_plays = weekday_counts[5] + weekday_counts[6]
        weekend_share = weekend_plays / len(plays)

        return {
            "hourly_distribution": hourly_counts,
            "peak_hour": peak_hour,
            "weekday_distribution": weekday_counts,
            "weekend_share": round(weekend_share, 2),
        }

    def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Groups plays into sessions and summarizes them.

        A gap of more than SESSION_GAP_MINUTES between consecutive plays
        starts a new session. Session length is (last - first) played_at;
        single-play sessions are approximated as 3 minutes (~one track).
        """
        plays = self._plays_in_period(period_start, period_end, ordered=True)

        if not plays:
            return {"count": 0, "avg_length_minutes": 0}

        sessions: List[List[PlayHistory]] = []
        current_session: List[PlayHistory] = [plays[0]]

        for prev, curr in zip(plays, plays[1:]):
            gap_minutes = (curr.played_at - prev.played_at).total_seconds() / 60
            if gap_minutes > self.SESSION_GAP_MINUTES:
                sessions.append(current_session)
                current_session = []
            current_session.append(curr)

        sessions.append(current_session)

        session_lengths_min = []
        for sess in sessions:
            if len(sess) > 1:
                # Span from first to last play start; does not add the last
                # track's own duration (kept simple on purpose).
                span = (sess[-1].played_at - sess[0].played_at).total_seconds() / 60
                session_lengths_min.append(span)
            else:
                session_lengths_min.append(3.0)  # Approx 1 track

        avg_min = sum(session_lengths_min) / len(session_lengths_min)

        return {
            "count": len(sessions),
            "avg_tracks": len(plays) / len(sessions),
            "avg_minutes": round(avg_min, 1),
            "longest_session_minutes": round(max(session_lengths_min), 1),
        }

    def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Aggregates audio features (energy, valence, tempo, ...) weighted by
        play count, so the averages reflect what was actually heard.

        Returns avg_*/std_* per feature plus a mood_quadrant point
        (x=valence, y=energy) when both averages are available; {} when the
        period has no plays.
        """
        plays = self._plays_in_period(period_start, period_end)
        if not plays:
            return {}

        # FIX: single-pass play counting (the original re-scanned all plays
        # for every track — accidental O(n*m)).
        play_counts = Counter(p.track_id for p in plays)
        tracks = self.db.query(Track).filter(Track.id.in_(list(play_counts))).all()

        feature_keys = (
            "energy", "valence", "danceability", "tempo",
            "acousticness", "instrumentalness", "liveness", "speechiness",
        )
        features: Dict[str, list] = {key: [] for key in feature_keys}

        for track in tracks:
            # "energy is None" is used as the sentinel for "no audio
            # features fetched for this track" — skip it entirely then.
            if track.energy is None:
                continue
            weight = play_counts[track.id]
            for key in feature_keys:
                # Repeat each value once per play to weight the aggregate.
                features[key].extend([getattr(track, key)] * weight)

        stats: Dict[str, Any] = {}
        for key, values in features.items():
            valid = [v for v in values if v is not None]
            if valid:
                stats[f"avg_{key}"] = float(np.mean(valid))
                stats[f"std_{key}"] = float(np.std(valid))
            else:
                stats[f"avg_{key}"] = None

        # Derived: mood quadrant (valence on x, energy on y). Truthiness
        # check also skips the (unlikely) exact-0.0 averages, matching the
        # original behavior.
        if stats.get("avg_energy") and stats.get("avg_valence"):
            stats["mood_quadrant"] = {
                "x": round(stats["avg_valence"], 2),
                "y": round(stats["avg_energy"], 2),
            }

        return stats

    def compute_era_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Musical age (average release year, play-weighted) and decade
        distribution. Returns {"musical_age": None} when no release dates
        are available.
        """
        plays = self._plays_in_period(period_start, period_end)
        track_map = self._track_map([p.track_id for p in plays])

        years: List[int] = []
        for play in plays:
            track = track_map.get(play.track_id)
            if not (track and track.raw_data):
                continue
            release_date = track.raw_data.get("album", {}).get("release_date")
            if not release_date:
                continue
            # Spotify formats: YYYY, YYYY-MM, or YYYY-MM-DD.
            try:
                years.append(int(release_date.split("-")[0]))
            except (ValueError, AttributeError):
                # FIX: narrowed from a bare except; malformed dates are skipped.
                pass

        if not years:
            return {"musical_age": None}

        avg_year = sum(years) / len(years)

        # Share of plays per decade, e.g. {"1990s": 0.25, ...}.
        decades: Counter = Counter(f"{(y // 10) * 10}s" for y in years)
        total = len(years)
        decade_dist = {label: round(count / total, 2) for label, count in decades.items()}

        return {
            "musical_age": int(avg_year),
            "decade_distribution": decade_dist,
        }

    def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Boredom-skip detection.

        A play counts as skipped when the next play started before the
        current track could have finished, with a 10-second tolerance:
        (next.played_at - current.played_at) < duration_seconds - 10.

        NOTE(review): Spotify's recently-played API only reports tracks
        listened to for 30s+, so very short skips never appear here.
        """
        plays = self._plays_in_period(period_start, period_end, ordered=True)

        if len(plays) < 2:
            return {"skip_rate": 0, "total_skips": 0}

        track_map = self._track_map([p.track_id for p in plays])

        skips = 0
        for current_play, next_play in zip(plays, plays[1:]):
            track = track_map.get(current_play.track_id)
            if not track or not track.duration_ms:
                continue

            gap_seconds = (next_play.played_at - current_play.played_at).total_seconds()
            duration_sec = track.duration_ms / 1000.0

            if gap_seconds < (duration_sec - 10):
                skips += 1

        return {
            "total_skips": skips,
            "skip_rate": round(skips / len(plays), 3),
        }

    def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Runs every metric over the window and bundles the results into a
        single JSON-serializable report keyed by metric family.
        """
        return {
            "period": {
                "start": period_start.isoformat(),
                "end": period_end.isoformat(),
            },
            "volume": self.compute_volume_stats(period_start, period_end),
            "time_habits": self.compute_time_stats(period_start, period_end),
            "sessions": self.compute_session_stats(period_start, period_end),
            "vibe": self.compute_vibe_stats(period_start, period_end),
            "era": self.compute_era_stats(period_start, period_end),
            "skips": self.compute_skip_stats(period_start, period_end),
        }
|
||||
Reference in New Issue
Block a user