mirror of
https://github.com/bnair123/MusicAnalyser.git
synced 2026-02-25 11:46:07 +00:00
- Refactor Database: Add `Artist` model, M2M relationship, and `AnalysisSnapshot` model.
- Backend Services: Implement `StatsService` for computable metrics and `NarrativeService` for Gemini LLM integration.
- Fix Ingestion: Correctly handle multiple artists per track and backfill existing data.
- Testing: Add unit tests for statistics logic and live verification scripts.
- Documentation: Add `PHASE_4_FRONTEND_GUIDE.md`.
397 lines
14 KiB
Python
397 lines
14 KiB
Python
from sqlalchemy.orm import Session
|
||
from sqlalchemy import func, distinct, desc
|
||
from datetime import datetime, timedelta
|
||
from typing import Dict, Any, List
|
||
import math
|
||
import numpy as np
|
||
|
||
from ..models import PlayHistory, Track, Artist, AnalysisSnapshot
|
||
|
||
class StatsService:
    """Compute listening statistics for a time window from stored play history.

    Every ``compute_*`` method loads the ``PlayHistory`` rows whose
    ``played_at`` lies in the inclusive range ``[period_start, period_end]``
    and returns a plain ``dict`` of metrics. ``generate_full_report`` bundles
    all of them into one report.

    NOTE(review): ``played_at`` is used as stored (no timezone conversion);
    hour/weekday buckets therefore follow whatever timezone the database
    holds -- presumably UTC, confirm before presenting as local time.
    """

    # A pause longer than this many minutes between two consecutive plays
    # starts a new listening session.
    SESSION_GAP_MINUTES = 20
    # Length (minutes) credited to a session that contains a single play,
    # since there is no second timestamp to measure against.
    SINGLE_PLAY_SESSION_MINUTES = 3.0
    # A play counts as skipped when the next play starts more than this many
    # seconds before the track would have finished.
    SKIP_TOLERANCE_SECONDS = 10
    # Number of entries in the "top tracks / artists / genres" lists.
    TOP_N = 5
    # Track attributes aggregated by ``compute_vibe_stats`` (output key order).
    AUDIO_FEATURES = (
        "energy", "valence", "danceability",
        "tempo", "acousticness", "instrumentalness",
        "liveness", "speechiness",
    )

    def __init__(self, db: "Session"):
        """Store the database session used for all queries."""
        self.db = db

    # ------------------------------------------------------------------
    # Shared query helpers
    # ------------------------------------------------------------------

    def _plays_in_period(self, period_start, period_end, ordered=False):
        """Return PlayHistory rows in the inclusive window, optionally oldest first."""
        query = self.db.query(PlayHistory).filter(
            PlayHistory.played_at >= period_start,
            PlayHistory.played_at <= period_end,
        )
        if ordered:
            query = query.order_by(PlayHistory.played_at.asc())
        return query.all()

    def _track_map(self, plays):
        """Bulk-load the tracks referenced by ``plays`` as ``{track.id: track}``."""
        track_ids = {p.track_id for p in plays}
        if not track_ids:
            # Nothing to look up; avoids issuing an empty ``IN ()`` query.
            return {}
        tracks = self.db.query(Track).filter(Track.id.in_(list(track_ids))).all()
        return {t.id: t for t in tracks}

    # ------------------------------------------------------------------
    # Pure helpers (no database access)
    # ------------------------------------------------------------------

    @staticmethod
    def _top_n(counts, n):
        """Return the ``n`` highest ``(key, count)`` pairs; ties keep insertion order."""
        return sorted(counts.items(), key=lambda kv: kv[1], reverse=True)[:n]

    @staticmethod
    def _album_key(track):
        """Return an identifier for the track's album.

        Prefers ``raw_data["album"]["id"]``; falls back to the ``album``
        attribute when the raw payload has no usable album id.
        """
        raw = track.raw_data
        album = raw.get("album") if isinstance(raw, dict) else None
        album_id = album.get("id") if isinstance(album, dict) else None
        return album_id if album_id is not None else track.album

    @staticmethod
    def _release_year(track):
        """Return the release year from ``raw_data["album"]["release_date"]`` or None.

        The date string may be ``YYYY``, ``YYYY-MM`` or ``YYYY-MM-DD``; anything
        missing or unparsable yields ``None`` instead of raising.
        """
        raw = getattr(track, "raw_data", None)
        album = raw.get("album") if isinstance(raw, dict) else None
        release_date = album.get("release_date") if isinstance(album, dict) else None
        if not isinstance(release_date, str):
            return None
        try:
            return int(release_date.split("-")[0])
        except ValueError:
            return None

    @staticmethod
    def _split_sessions(plays, gap_minutes):
        """Group chronologically ordered plays into sessions.

        A new session starts whenever the gap to the previous play exceeds
        ``gap_minutes``. Returns a list of non-empty lists of plays.
        """
        sessions = []
        current = []
        previous = None
        for play in plays:
            if previous is not None:
                gap = (play.played_at - previous.played_at).total_seconds() / 60
                if gap > gap_minutes:
                    sessions.append(current)
                    current = []
            current.append(play)
            previous = play
        if current:
            sessions.append(current)
        return sessions

    @staticmethod
    def _session_minutes(session, single_play_minutes):
        """Return a session's length in minutes.

        Measured from the first to the last ``played_at``; the last track's
        own duration is not added. A single-play session is credited
        ``single_play_minutes``.
        """
        if len(session) > 1:
            return (session[-1].played_at - session[0].played_at).total_seconds() / 60
        return single_play_minutes

    @staticmethod
    def _count_skips(plays, track_map, tolerance_seconds):
        """Count plays cut short by the next play.

        ``plays`` must be ordered oldest first. A play is a skip when the time
        until the next play is shorter than the track duration minus
        ``tolerance_seconds``. Plays whose track is unknown or has no duration
        are ignored; the final play can never be classified.
        """
        skips = 0
        for current, following in zip(plays, plays[1:]):
            track = track_map.get(current.track_id)
            if not track or not track.duration_ms:
                continue
            elapsed = (following.played_at - current.played_at).total_seconds()
            if elapsed < track.duration_ms / 1000.0 - tolerance_seconds:
                skips += 1
        return skips

    @classmethod
    def _aggregate_features(cls, tracks_with_counts):
        """Aggregate audio features over ``(track, play_count)`` pairs.

        Each track contributes its feature values ``play_count`` times, so the
        averages reflect what was actually heard. Tracks whose ``energy`` is
        ``None`` are skipped entirely (presumably features are fetched as a
        set -- TODO confirm). Returns ``avg_<feature>`` / ``std_<feature>``
        (``None`` when no values exist) plus ``mood_quadrant`` when both
        energy and valence averages are available.
        """
        values_by_feature = {name: [] for name in cls.AUDIO_FEATURES}
        for track, count in tracks_with_counts:
            if track.energy is None:
                continue
            for name, values in values_by_feature.items():
                values.extend([getattr(track, name)] * count)

        stats = {}
        for name, values in values_by_feature.items():
            valid = [v for v in values if v is not None]
            if valid:
                stats[f"avg_{name}"] = float(np.mean(valid))
                stats[f"std_{name}"] = float(np.std(valid))
            else:
                stats[f"avg_{name}"] = None
                stats[f"std_{name}"] = None

        # Compare against None explicitly: an average of exactly 0.0 is a
        # valid value and must not suppress the quadrant.
        if stats["avg_energy"] is not None and stats["avg_valence"] is not None:
            stats["mood_quadrant"] = {
                "x": round(stats["avg_valence"], 2),
                "y": round(stats["avg_energy"], 2),
            }
        return stats

    # ------------------------------------------------------------------
    # Public metrics
    # ------------------------------------------------------------------

    def compute_volume_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Calculate volume metrics: total plays, unique tracks/artists/albums, top lists.

        Returns a dict with ``total_plays``, ``estimated_minutes``,
        ``unique_tracks``, ``unique_artists``, ``unique_albums``,
        ``unique_genres``, ``top_tracks``, ``top_artists``, ``top_genres``,
        ``repeat_rate`` and ``concentration`` (Herfindahl-Hirschman index of
        per-track play shares). The same keys are present when the window
        contains no plays (``concentration`` is then empty).
        """
        plays = self._plays_in_period(period_start, period_end)
        total_plays = len(plays)

        if total_plays == 0:
            return {
                "total_plays": 0,
                "estimated_minutes": 0,
                "unique_tracks": 0,
                "unique_artists": 0,
                "unique_albums": 0,
                "unique_genres": 0,
                "top_tracks": [],
                "top_artists": [],
                "top_genres": [],
                "repeat_rate": 0,
                "concentration": {},
            }

        # Fetch all referenced tracks in one query instead of one per play.
        track_map = self._track_map(plays)

        total_ms = 0
        unique_album_keys = set()
        genre_counts = {}
        track_play_counts = {}
        artist_play_counts = {}

        for play in plays:
            track = track_map.get(play.track_id)
            if track is None:
                continue

            # Duration is "estimated": it assumes each play ran the full track.
            total_ms += track.duration_ms or 0
            track_play_counts[track.id] = track_play_counts.get(track.id, 0) + 1

            # NOTE: ``track.artists`` is accessed through the ORM relation and
            # may trigger one lazy load per track if it is not eager-loaded.
            # Every credited artist (and each of their genres) gets one count
            # per play.
            for artist in track.artists or ():
                artist_play_counts[artist.id] = artist_play_counts.get(artist.id, 0) + 1
                for genre in artist.genres or ():
                    genre_counts[genre] = genre_counts.get(genre, 0) + 1

            unique_album_keys.add(self._album_key(track))

        top_tracks = []
        for track_id, count in self._top_n(track_play_counts, self.TOP_N):
            track = track_map[track_id]
            top_tracks.append({
                "name": track.name,
                "artist": track.artist,  # Display string
                "count": count,
            })

        # Artist names are looked up only for the handful that made the list.
        top_artist_counts = self._top_n(artist_play_counts, self.TOP_N)
        artist_name_map = {}
        if top_artist_counts:
            top_artist_rows = self.db.query(Artist).filter(
                Artist.id.in_([artist_id for artist_id, _ in top_artist_counts])
            ).all()
            artist_name_map = {a.id: a.name for a in top_artist_rows}
        top_artists = [
            {"name": artist_name_map.get(artist_id, "Unknown"), "count": count}
            for artist_id, count in top_artist_counts
        ]

        top_genres = [
            {"name": genre, "count": count}
            for genre, count in self._top_n(genre_counts, self.TOP_N)
        ]

        unique_tracks_count = len(track_play_counts)
        repeat_rate = (total_plays - unique_tracks_count) / total_plays

        # HHI (Herfindahl-Hirschman Index): sum of squared play shares,
        # where share = track_plays / total_plays.
        hhi = sum((count / total_plays) ** 2 for count in track_play_counts.values())

        return {
            "total_plays": total_plays,
            "estimated_minutes": int(total_ms / 60000),
            "unique_tracks": unique_tracks_count,
            "unique_artists": len(artist_play_counts),
            "unique_albums": len(unique_album_keys),
            "unique_genres": len(genre_counts),
            "top_tracks": top_tracks,
            "top_artists": top_artists,
            "top_genres": top_genres,
            "repeat_rate": round(repeat_rate, 3),
            "concentration": {
                "hhi": round(hhi, 4),
                # "gini": ... (skip for now to keep it simple)
            },
        }

    def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Compute hourly and weekday play distributions.

        Returns ``hourly_distribution`` (24 counts), ``peak_hour`` (index of
        the busiest hour, ``None`` when there are no plays),
        ``weekday_distribution`` (7 counts, 0=Mon .. 6=Sun) and
        ``weekend_share`` (fraction of plays on Sat/Sun).
        """
        plays = self._plays_in_period(period_start, period_end)

        hourly_counts = [0] * 24
        weekday_counts = [0] * 7  # 0=Mon, 6=Sun

        if not plays:
            # Same keys as the populated result so consumers need no special case.
            return {
                "hourly_distribution": hourly_counts,
                "peak_hour": None,
                "weekday_distribution": weekday_counts,
                "weekend_share": 0,
            }

        for play in plays:
            hourly_counts[play.played_at.hour] += 1
            weekday_counts[play.played_at.weekday()] += 1

        weekend_plays = weekday_counts[5] + weekday_counts[6]

        return {
            "hourly_distribution": hourly_counts,
            "peak_hour": hourly_counts.index(max(hourly_counts)),
            "weekday_distribution": weekday_counts,
            "weekend_share": round(weekend_plays / len(plays), 2),
        }

    def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Compute session metrics; a gap above ``SESSION_GAP_MINUTES`` starts a new session.

        Returns ``count``, ``avg_tracks``, ``avg_minutes`` and
        ``longest_session_minutes``. When there are no plays all of them are
        0; the legacy ``avg_length_minutes`` key is kept in that case for
        callers that relied on the earlier empty-result shape.
        """
        plays = self._plays_in_period(period_start, period_end, ordered=True)

        if not plays:
            return {
                "count": 0,
                "avg_length_minutes": 0,
                "avg_tracks": 0,
                "avg_minutes": 0,
                "longest_session_minutes": 0,
            }

        sessions = self._split_sessions(plays, self.SESSION_GAP_MINUTES)
        session_lengths_min = [
            self._session_minutes(session, self.SINGLE_PLAY_SESSION_MINUTES)
            for session in sessions
        ]
        avg_min = sum(session_lengths_min) / len(session_lengths_min)

        return {
            "count": len(sessions),
            "avg_tracks": len(plays) / len(sessions),
            "avg_minutes": round(avg_min, 1),
            "longest_session_minutes": round(max(session_lengths_min), 1),
        }

    def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Aggregate audio features (energy, valence, ...) weighted by play count.

        Returns an empty dict when the window contains no plays; otherwise
        the mapping produced by ``_aggregate_features``.
        """
        plays = self._plays_in_period(period_start, period_end)
        if not plays:
            return {}

        # Count plays per track once instead of rescanning all plays per track.
        play_counts = {}
        for play in plays:
            play_counts[play.track_id] = play_counts.get(play.track_id, 0) + 1

        track_map = self._track_map(plays)
        return self._aggregate_features(
            (track, play_counts.get(track.id, 0)) for track in track_map.values()
        )

    def compute_era_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Compute the average release year and per-decade share of plays.

        Returns ``musical_age`` (average release year truncated to int, or
        ``None`` when no play has a usable release date) and
        ``decade_distribution`` (``{"1990s": 0.25, ...}``, fractions of the
        plays that had a release year).
        """
        plays = self._plays_in_period(period_start, period_end)
        track_map = self._track_map(plays)

        years = []
        for play in plays:
            track = track_map.get(play.track_id)
            year = self._release_year(track) if track is not None else None
            if year is not None:
                years.append(year)

        if not years:
            return {"musical_age": None, "decade_distribution": {}}

        decades = {}
        for year in years:
            label = f"{(year // 10) * 10}s"
            decades[label] = decades.get(label, 0) + 1

        total = len(years)
        return {
            "musical_age": int(sum(years) / total),
            "decade_distribution": {
                label: round(count / total, 2) for label, count in decades.items()
            },
        }

    def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Detect "boredom skips".

        A play is a skip when
        ``next.played_at - current.played_at < duration - SKIP_TOLERANCE_SECONDS``.
        Returns ``total_skips`` and ``skip_rate`` (skips divided by the number
        of plays in the window). Fewer than two plays yields zeros.
        """
        plays = self._plays_in_period(period_start, period_end, ordered=True)

        if len(plays) < 2:
            return {"skip_rate": 0, "total_skips": 0}

        track_map = self._track_map(plays)
        skips = self._count_skips(plays, track_map, self.SKIP_TOLERANCE_SECONDS)

        return {
            "total_skips": skips,
            "skip_rate": round(skips / len(plays), 3),
        }

    def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """Run every ``compute_*`` method for the window and bundle the results.

        The ``period`` entry echoes the window bounds as ISO-8601 strings.
        """
        return {
            "period": {
                "start": period_start.isoformat(),
                "end": period_end.isoformat(),
            },
            "volume": self.compute_volume_stats(period_start, period_end),
            "time_habits": self.compute_time_stats(period_start, period_end),
            "sessions": self.compute_session_stats(period_start, period_end),
            "vibe": self.compute_vibe_stats(period_start, period_end),
            "era": self.compute_era_stats(period_start, period_end),
            "skips": self.compute_skip_stats(period_start, period_end),
        }
|