MusicAnalyser/backend/app/services/stats_service.py
google-labs-jules[bot] f4432154b6 Implement Phase 3 Music Analysis and LLM Engine
- Refactor Database: Add `Artist` model, M2M relationship, and `AnalysisSnapshot` model.
- Backend Services: Implement `StatsService` for computable metrics and `NarrativeService` for Gemini LLM integration.
- Fix Ingestion: Correctly handle multiple artists per track and backfill existing data.
- Testing: Add unit tests for statistics logic and live verification scripts.
- Documentation: Add `PHASE_4_FRONTEND_GUIDE.md`.
2025-12-24 23:16:32 +00:00

from sqlalchemy.orm import Session
from sqlalchemy import func, distinct, desc
from datetime import datetime, timedelta
from typing import Dict, Any, List
import math
import numpy as np
from ..models import PlayHistory, Track, Artist, AnalysisSnapshot
class StatsService:
def __init__(self, db: Session):
self.db = db
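    # Example usage (sketch; assumes a SQLAlchemy session factory such as SessionLocal):
    #   with SessionLocal() as db:
    #       report = StatsService(db).generate_full_report(start, end)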
def compute_volume_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Calculates volume metrics: Total Plays, Unique Tracks, Artists, etc.
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
total_plays = len(plays)
if total_plays == 0:
return {
"total_plays": 0,
"estimated_minutes": 0,
"unique_tracks": 0,
"unique_artists": 0,
"unique_albums": 0,
"unique_genres": 0,
"top_tracks": [],
"top_artists": [],
"repeat_rate": 0,
"concentration": {}
}
        # Estimated listening duration: fetch every referenced Track in one bulk
        # query and build an id -> Track map (cheaper than joining or lazy-loading
        # a Track per play).
track_ids = [p.track_id for p in plays]
tracks = self.db.query(Track).filter(Track.id.in_(set(track_ids))).all()
track_map = {t.id: t for t in tracks}
total_ms = 0
unique_track_ids = set()
unique_artist_ids = set()
        # Album IDs come from raw_data['album']['id'] when present; otherwise we
        # fall back to the album name string stored on Track.
        unique_album_ids = set()
genre_counts = {}
# For Top Lists
track_play_counts = {}
artist_play_counts = {}
for p in plays:
t = track_map.get(p.track_id)
if t:
total_ms += t.duration_ms
unique_track_ids.add(t.id)
# Top Tracks
track_play_counts[t.id] = track_play_counts.get(t.id, 0) + 1
                # Artists: use the M2M relation for correctness. Lazy-loading
                # t.artists can trigger N+1 queries over large periods; if that
                # becomes slow, eager-load the relation or join
                # PlayHistory -> Track -> Artist in the query instead.
for artist in t.artists:
unique_artist_ids.add(artist.id)
artist_play_counts[artist.id] = artist_play_counts.get(artist.id, 0) + 1
if artist.genres:
for g in artist.genres:
genre_counts[g] = genre_counts.get(g, 0) + 1
if t.raw_data and "album" in t.raw_data:
unique_album_ids.add(t.raw_data["album"]["id"])
else:
unique_album_ids.add(t.album) # Fallback
estimated_minutes = total_ms / 60000
# Top 5 Tracks
sorted_tracks = sorted(track_play_counts.items(), key=lambda x: x[1], reverse=True)[:5]
top_tracks = []
for tid, count in sorted_tracks:
t = track_map.get(tid)
top_tracks.append({
"name": t.name,
"artist": t.artist, # Display string
"count": count
})
# Top 5 Artists
# Need to fetch Artist names
top_artist_ids = sorted(artist_play_counts.items(), key=lambda x: x[1], reverse=True)[:5]
top_artists_objs = self.db.query(Artist).filter(Artist.id.in_([x[0] for x in top_artist_ids])).all()
artist_name_map = {a.id: a.name for a in top_artists_objs}
top_artists = []
for aid, count in top_artist_ids:
top_artists.append({
"name": artist_name_map.get(aid, "Unknown"),
"count": count
})
# Top Genres
sorted_genres = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:5]
top_genres = [{"name": g, "count": c} for g, c in sorted_genres]
# Concentration
unique_tracks_count = len(unique_track_ids)
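        # Repeat rate: fraction of plays that were re-listens of an already-played track.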
repeat_rate = (total_plays - unique_tracks_count) / total_plays if total_plays > 0 else 0
        # HHI (Herfindahl-Hirschman Index)
# Sum of (share)^2. Share = track_plays / total_plays
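        # HHI ranges from 1/N (plays spread evenly over N tracks) to 1.0 (every play on one track).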
hhi = sum([(c/total_plays)**2 for c in track_play_counts.values()])
return {
"total_plays": total_plays,
"estimated_minutes": int(estimated_minutes),
"unique_tracks": unique_tracks_count,
"unique_artists": len(unique_artist_ids),
"unique_albums": len(unique_album_ids),
"unique_genres": len(genre_counts),
"top_tracks": top_tracks,
"top_artists": top_artists,
"top_genres": top_genres,
"repeat_rate": round(repeat_rate, 3),
"concentration": {
"hhi": round(hhi, 4),
# "gini": ... (skip for now to keep it simple)
}
}
def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Hourly, Daily distribution, etc.
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
hourly_counts = [0] * 24
weekday_counts = [0] * 7 # 0=Mon, 6=Sun
        if not plays:
            return {
                "hourly_distribution": hourly_counts,
                "peak_hour": None,
                "weekday_distribution": weekday_counts,
                "weekend_share": 0
            }
for p in plays:
            # played_at is stored in UTC; convert to the user's local timezone here
            # if local-time habits are wanted. For now we bucket in UTC.
h = p.played_at.hour
d = p.played_at.weekday()
hourly_counts[h] += 1
weekday_counts[d] += 1
peak_hour = hourly_counts.index(max(hourly_counts))
# Weekend Share
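        # weekday() indices 5 and 6 are Saturday and Sunday.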
weekend_plays = weekday_counts[5] + weekday_counts[6]
weekend_share = weekend_plays / len(plays) if len(plays) > 0 else 0
return {
"hourly_distribution": hourly_counts,
"peak_hour": peak_hour,
"weekday_distribution": weekday_counts,
"weekend_share": round(weekend_share, 2)
}
def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Session logic: Gap > 20 mins = new session.
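        Example: plays at 10:00, 10:05 and 10:40 form two sessions, [10:00, 10:05] and [10:40].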
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
).order_by(PlayHistory.played_at.asc())
plays = query.all()
        if not plays:
            return {"count": 0, "avg_tracks": 0, "avg_minutes": 0, "longest_session_minutes": 0}
sessions = []
current_session = [plays[0]]
for i in range(1, len(plays)):
prev = plays[i-1]
curr = plays[i]
diff = (curr.played_at - prev.played_at).total_seconds() / 60
if diff > 20:
sessions.append(current_session)
current_session = []
current_session.append(curr)
sessions.append(current_session)
session_lengths_min = []
for sess in sessions:
if len(sess) > 1:
start = sess[0].played_at
end = sess[-1].played_at
                # (end - start) ignores the duration of the last track; we accept
                # that undercount for simplicity.
duration = (end - start).total_seconds() / 60
session_lengths_min.append(duration)
else:
session_lengths_min.append(3.0) # Approx 1 track
avg_min = sum(session_lengths_min) / len(session_lengths_min) if session_lengths_min else 0
return {
"count": len(sessions),
"avg_tracks": len(plays) / len(sessions),
"avg_minutes": round(avg_min, 1),
"longest_session_minutes": round(max(session_lengths_min), 1) if session_lengths_min else 0
}
def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Aggregates Audio Features (Energy, Valence, etc.)
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
track_ids = list(set([p.track_id for p in plays]))
if not track_ids:
return {}
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
# Collect features
features = {
"energy": [], "valence": [], "danceability": [],
"tempo": [], "acousticness": [], "instrumentalness": [],
"liveness": [], "speechiness": []
}
for t in tracks:
            # Weight each track's features by its play count in the period: a track
            # played five times contributes five samples, which better reflects what
            # was actually heard than a per-track average.
play_count = len([p for p in plays if p.track_id == t.id])
if t.energy is not None:
for _ in range(play_count):
features["energy"].append(t.energy)
features["valence"].append(t.valence)
features["danceability"].append(t.danceability)
features["tempo"].append(t.tempo)
features["acousticness"].append(t.acousticness)
features["instrumentalness"].append(t.instrumentalness)
features["liveness"].append(t.liveness)
features["speechiness"].append(t.speechiness)
stats = {}
for key, values in features.items():
valid = [v for v in values if v is not None]
if valid:
stats[f"avg_{key}"] = float(np.mean(valid))
stats[f"std_{key}"] = float(np.std(valid))
            else:
                stats[f"avg_{key}"] = None
                stats[f"std_{key}"] = None
# Derived Metrics
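        # Mood quadrant: x = valence (sad -> happy), y = energy (calm -> intense), both on a 0-1 scale.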
if stats.get("avg_energy") and stats.get("avg_valence"):
stats["mood_quadrant"] = {
"x": round(stats["avg_valence"], 2),
"y": round(stats["avg_energy"], 2)
}
return stats
def compute_era_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Musical Age and Era Distribution.
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
years = []
track_ids = list(set([p.track_id for p in plays]))
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
track_map = {t.id: t for t in tracks}
for p in plays:
t = track_map.get(p.track_id)
if t and t.raw_data and "album" in t.raw_data and "release_date" in t.raw_data["album"]:
rd = t.raw_data["album"]["release_date"]
# Format can be YYYY, YYYY-MM, YYYY-MM-DD
try:
year = int(rd.split("-")[0])
years.append(year)
                except (ValueError, AttributeError):
pass
if not years:
return {"musical_age": None}
avg_year = sum(years) / len(years)
# Decade breakdown
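        # e.g. a 1997 release falls into the "1990s" bucket.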
decades = {}
for y in years:
dec = (y // 10) * 10
label = f"{dec}s"
decades[label] = decades.get(label, 0) + 1
total = len(years)
decade_dist = {k: round(v/total, 2) for k, v in decades.items()}
return {
"musical_age": int(avg_year),
"decade_distribution": decade_dist
}
def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Implements boredom skip detection:
(next_track.played_at - current_track.played_at) < (current_track.duration_ms / 1000 - 10s)
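        Example: a 200s track followed 45s later counts as a skip (45 < 200 - 10).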
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
).order_by(PlayHistory.played_at.asc())
plays = query.all()
if len(plays) < 2:
return {"skip_rate": 0, "total_skips": 0}
skips = 0
track_ids = list(set([p.track_id for p in plays]))
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
track_map = {t.id: t for t in tracks}
for i in range(len(plays) - 1):
current_play = plays[i]
next_play = plays[i+1]
track = track_map.get(current_play.track_id)
if not track or not track.duration_ms:
continue
diff_seconds = (next_play.played_at - current_play.played_at).total_seconds()
            # A play counts as a skip when the next play starts more than 10s
            # before this track would have finished (diff < duration - 10s).
            duration_sec = track.duration_ms / 1000.0
            # Spotify's recently-played feed only records tracks played for 30s+,
            # so negative or very small gaps (re-plays) should be rare here.
if diff_seconds < (duration_sec - 10):
skips += 1
return {
"total_skips": skips,
"skip_rate": round(skips / len(plays), 3)
}
def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
return {
"period": {
"start": period_start.isoformat(),
"end": period_end.isoformat()
},
"volume": self.compute_volume_stats(period_start, period_end),
"time_habits": self.compute_time_stats(period_start, period_end),
"sessions": self.compute_session_stats(period_start, period_end),
"vibe": self.compute_vibe_stats(period_start, period_end),
"era": self.compute_era_stats(period_start, period_end),
"skips": self.compute_skip_stats(period_start, period_end)
}