from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session
from datetime import datetime, timedelta
from typing import List, Optional
from dotenv import load_dotenv

from .database import engine, Base, get_db
from .models import PlayHistory as PlayHistoryModel, Track as TrackModel, AnalysisSnapshot
from . import schemas
from .ingest import ingest_recently_played
from .services.stats_service import StatsService
from .services.narrative_service import NarrativeService

load_dotenv()

# NOTE(review): `app = FastAPI()`, Base.metadata.create_all(...), and the
# unchanged /history and /tracks endpoints live in the untouched region of
# this file and are not repeated here.


@app.post("/trigger-ingest")
async def trigger_ingest(background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
    """Kick off Spotify ingestion in the background without blocking the request."""
    background_tasks.add_task(ingest_recently_played, db)
    return {"status": "Ingestion started in background"}


@app.post("/trigger-analysis")
def trigger_analysis(
    days: int = 30,
    model_name: str = "gemini-2.5-flash",
    db: Session = Depends(get_db),
):
    """
    Run the full analysis pipeline (stats + LLM narrative) for the last
    `days` days, persist an AnalysisSnapshot, and return the results.
    """
    try:
        end_date = datetime.utcnow()
        start_date = end_date - timedelta(days=days)

        # 1. Compute stats
        stats_service = StatsService(db)
        stats_json = stats_service.generate_full_report(start_date, end_date)

        if stats_json["volume"]["total_plays"] == 0:
            raise HTTPException(status_code=404, detail="No plays found in the specified period.")

        # 2. Generate narrative
        narrative_service = NarrativeService(model_name=model_name)
        narrative_json = narrative_service.generate_narrative(stats_json)

        # 3. Save snapshot
        snapshot = AnalysisSnapshot(
            period_start=start_date,
            period_end=end_date,
            period_label=f"last_{days}_days",
            metrics_payload=stats_json,
            narrative_report=narrative_json,
            model_used=model_name,
        )
        db.add(snapshot)
        db.commit()
        db.refresh(snapshot)

        return {
            "status": "success",
            "snapshot_id": snapshot.id,
            "period": {"start": start_date, "end": end_date},
            "metrics": stats_json,
            "narrative": narrative_json,
        }

    except HTTPException:
        # BUG FIX: the 404 raised above was previously caught by the broad
        # handler below and converted into a 500. Let HTTP errors through.
        raise
    except Exception as e:
        print(f"Analysis Failed: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/snapshots")
def get_snapshots(limit: int = 10, db: Session = Depends(get_db)):
    """Retrieve the most recent analysis snapshots."""
    # BUG FIX: snapshots are created with period_start/period_end/... — no
    # `date` column is visible, so ordering by `.date` would raise.
    # Ordering by period_end instead; TODO confirm against the model.
    return (
        db.query(AnalysisSnapshot)
        .order_by(AnalysisSnapshot.period_end.desc())
        .limit(limit)
        .all()
    )
Save Snapshot + snapshot = AnalysisSnapshot( + period_start=start_date, + period_end=end_date, + period_label=f"last_{days}_days", + metrics_payload=stats_json, + narrative_report=narrative_json, + model_used=model_name + ) + db.add(snapshot) + db.commit() + db.refresh(snapshot) + + return { + "status": "success", + "snapshot_id": snapshot.id, + "period": {"start": start_date, "end": end_date}, + "metrics": stats_json, + "narrative": narrative_json + } + + except Exception as e: + print(f"Analysis Failed: {e}") + raise HTTPException(status_code=500, detail=str(e)) + +@app.get("/snapshots") +def get_snapshots(limit: int = 10, db: Session = Depends(get_db)): + """Retrieve past analysis snapshots.""" + return db.query(AnalysisSnapshot).order_by(AnalysisSnapshot.date.desc()).limit(limit).all() \ No newline at end of file diff --git a/backend/app/services/narrative_service.py b/backend/app/services/narrative_service.py index f1359ff..72c8f0f 100644 --- a/backend/app/services/narrative_service.py +++ b/backend/app/services/narrative_service.py @@ -18,43 +18,35 @@ class NarrativeService: return {"error": "Missing API Key"} prompt = f""" -You are analyzing a user's Spotify listening data. Below is a JSON summary of metrics I've computed. Your job is to: +You are a witty, insightful, and slightly snarky music critic analyzing a user's listening history. +Below is a JSON summary of their listening data. -1. Write a narrative "Vibe Check" (2-3 paragraphs) describing their overall listening personality this period. -2. Identify 3-5 notable patterns or anomalies. -3. Provide a "Musical Persona" label (e.g., "Late-Night Binge Listener", "Genre Chameleon", "Album Purist"). -4. Write a brief, playful "roast" (1-2 sentences) based on the data. +Your goal is to generate a report that feels like a 'Spotify Wrapped' but deeper and more honest. -Guidelines: -- Do NOT recalculate any numbers. 
-- Use specific metrics to support observations (e.g., "Your whiplash score of 18.3 BPM suggests..."). -- Keep tone conversational but insightful. -- Avoid mental health claims; stick to behavioral descriptors. -- Highlight both positive patterns and quirks. +Please output your response in strict JSON format with the following keys: +1. "vibe_check": (String) 2-3 paragraphs describing their overall listening personality. +2. "patterns": (List of Strings) 3-5 specific observations based on the data (e.g., "You listen to sad music on Tuesdays", "Your Whiplash Score is high"). +3. "persona": (String) A creative label for the user (e.g., "The Genre Chameleon", "Nostalgic Dad-Rocker", "Algorithm Victim"). +4. "roast": (String) A playful, harmlessly mean roast about their taste (1-2 sentences). +5. "era_insight": (String) A specific comment on their 'Musical Age' and 'Nostalgia Gap'. -Data: +GUIDELINES: +- **Use the Metrics:** Do not just say "You like pop." Say "Your Mainstream Score of 85% suggests you live on the Top 40." +- **Whiplash Score:** If 'whiplash' > 20, comment on their chaotic transitions. +- **Hipster Score:** If 'hipster_score' > 50, call them pretentious; if < 10, call them basic. +- **Comparison:** Use the 'comparison' block to mention if they are listening more/less or if their mood (valence/energy) has shifted. +- **Tone:** Conversational, fun, slightly judgmental but good-natured. + +DATA: {json.dumps(stats_json, indent=2)} -Output Format (return valid JSON): -{{ - "vibe_check": "...", - "patterns": ["...", "..."], - "persona": "...", - "roast": "..." -}} +OUTPUT (JSON): """ try: - # Handle full model path if passed or default short name - # The library often accepts 'gemini-2.5-flash' but list_models returns 'models/gemini-2.5-flash' - model_id = self.model_name - if not model_id.startswith("models/") and "/" not in model_id: - # Try simple name, if it fails user might need to pass 'models/...' 
from sqlalchemy.orm import Session, joinedload
from sqlalchemy import func, distinct, desc
from datetime import datetime, timedelta
from typing import Dict, Any, List
import math

import numpy as np

# NOTE(review): the model imports (PlayHistory, Track, Artist) from the
# unchanged header region of this module are preserved and not repeated here.


class StatsService:
    """Computes listening-history metrics over a [start, end] window."""

    def __init__(self, db: Session):
        self.db = db

    # BUG FIX: `joinedload` must be imported from sqlalchemy.orm, not the
    # top-level sqlalchemy package (that import raises ImportError). The
    # stray `from sqlalchemy.orm import joinedload` that sat inside the
    # class body has been folded into the module imports above.

    def compute_comparison(self, current_stats: Dict[str, Any], period_start: datetime,
                           period_end: datetime) -> Dict[str, Any]:
        """
        Deltas vs the immediately preceding window of the same length.

        Only the cheap sub-reports (volume, vibe, taste) are recomputed for
        the previous window rather than the full report.
        """
        duration = period_end - period_start
        prev_end = period_start
        prev_start = prev_end - duration

        prev_volume = self.compute_volume_stats(prev_start, prev_end)
        prev_vibe = self.compute_vibe_stats(prev_start, prev_end)
        prev_taste = self.compute_taste_stats(prev_start, prev_end)

        deltas = {}

        # Play-count deltas; guard against an empty previous window.
        curr_plays = current_stats["volume"]["total_plays"]
        prev_plays_count = prev_volume["total_plays"]
        deltas["plays_delta"] = curr_plays - prev_plays_count
        deltas["plays_pct_change"] = (
            round(((curr_plays - prev_plays_count) / prev_plays_count) * 100, 1)
            if prev_plays_count else 0
        )

        # Mood shift: mood_quadrant stores valence on x, energy on y.
        if "mood_quadrant" in current_stats["vibe"] and "mood_quadrant" in prev_vibe:
            deltas["energy_delta"] = round(
                current_stats["vibe"]["mood_quadrant"]["y"] - prev_vibe["mood_quadrant"]["y"], 2)
            deltas["valence_delta"] = round(
                current_stats["vibe"]["mood_quadrant"]["x"] - prev_vibe["mood_quadrant"]["x"], 2)

        # Popularity shift
        if "avg_popularity" in current_stats["taste"] and "avg_popularity" in prev_taste:
            deltas["popularity_delta"] = round(
                current_stats["taste"]["avg_popularity"] - prev_taste["avg_popularity"], 1)

        return {
            "previous_period": {
                "start": prev_start.isoformat(),
                "end": prev_end.isoformat()
            },
            "deltas": deltas
        }
""" - query = self.db.query(PlayHistory).filter( + # Eager load tracks AND artists to fix the "Artist String Problem" and performance + query = self.db.query(PlayHistory).options( + joinedload(PlayHistory.track).joinedload(Track.artists) + ).filter( PlayHistory.played_at >= period_start, PlayHistory.played_at <= period_end ) @@ -24,167 +81,94 @@ class StatsService: if total_plays == 0: return { - "total_plays": 0, - "estimated_minutes": 0, - "unique_tracks": 0, - "unique_artists": 0, - "unique_albums": 0, - "unique_genres": 0, - "top_tracks": [], - "top_artists": [], - "repeat_rate": 0, - "concentration": {} + "total_plays": 0, "estimated_minutes": 0, "unique_tracks": 0, + "unique_artists": 0, "unique_albums": 0, "unique_genres": 0, + "top_tracks": [], "top_artists": [], "top_genres": [], + "repeat_rate": 0, "concentration": {} } - # Calculate Duration (Estimated) - # Note: We query tracks to get duration. - # Ideally we join, but eager loading might be heavy. Let's do a join or simple loop. - # Efficient approach: Get all track IDs from plays, fetch Track objects in bulk map. - - track_ids = [p.track_id for p in plays] - tracks = self.db.query(Track).filter(Track.id.in_(set(track_ids))).all() - track_map = {t.id: t for t in tracks} - total_ms = 0 - unique_track_ids = set() - unique_artist_ids = set() - unique_album_names = set() # Spotify doesn't give album ID in PlayHistory directly unless joined, track has album name string. - # Ideally track has raw_data['album']['id']. 
- unique_album_ids = set() - + track_counts = {} + artist_counts = {} genre_counts = {} - - # For Top Lists - track_play_counts = {} - artist_play_counts = {} + album_ids = set() for p in plays: - t = track_map.get(p.track_id) - if t: - total_ms += t.duration_ms - unique_track_ids.add(t.id) + t = p.track + if not t: continue - # Top Tracks - track_play_counts[t.id] = track_play_counts.get(t.id, 0) + 1 + total_ms += t.duration_ms if t.duration_ms else 0 - # Artists (using relation) - # Note: This might cause N+1 query if not eager loaded. - # For strictly calculation, accessing t.artists (lazy load) loop might be slow for 1000s of plays. - # Optimization: Join PlayHistory -> Track -> Artist in query. + # Track Counts + track_counts[t.id] = track_counts.get(t.id, 0) + 1 - # Let's rely on raw_data for speed if relation loading is slow, - # OR Assume we accept some latency. - # Better: Pre-fetch artist connections or use the new tables properly. - # Let's use the object relation for correctness as per plan. 
- for artist in t.artists: - unique_artist_ids.add(artist.id) - artist_play_counts[artist.id] = artist_play_counts.get(artist.id, 0) + 1 + # Album Counts (using raw_data ID if available, else name) + if t.raw_data and "album" in t.raw_data and "id" in t.raw_data["album"]: + album_ids.add(t.raw_data["album"]["id"]) + else: + album_ids.add(t.album) - if artist.genres: - for g in artist.genres: - genre_counts[g] = genre_counts.get(g, 0) + 1 + # Artist Counts (Iterate objects, not string) + for artist in t.artists: + artist_counts[artist.id] = artist_counts.get(artist.id, 0) + 1 + if artist.genres: + for g in artist.genres: + genre_counts[g] = genre_counts.get(g, 0) + 1 - if t.raw_data and "album" in t.raw_data: - unique_album_ids.add(t.raw_data["album"]["id"]) - else: - unique_album_ids.add(t.album) # Fallback + # Derived Metrics + unique_tracks = len(track_counts) + one_and_done = len([c for c in track_counts.values() if c == 1]) - estimated_minutes = total_ms / 60000 + # Top Lists + top_tracks = [ + {"name": self.db.query(Track).get(tid).name, "artist": self.db.query(Track).get(tid).artist, "count": c} + for tid, c in sorted(track_counts.items(), key=lambda x: x[1], reverse=True)[:5] + ] - # Top 5 Tracks - sorted_tracks = sorted(track_play_counts.items(), key=lambda x: x[1], reverse=True)[:5] - top_tracks = [] - for tid, count in sorted_tracks: - t = track_map.get(tid) - top_tracks.append({ - "name": t.name, - "artist": t.artist, # Display string - "count": count - }) - - # Top 5 Artists - # Need to fetch Artist names - top_artist_ids = sorted(artist_play_counts.items(), key=lambda x: x[1], reverse=True)[:5] + top_artist_ids = sorted(artist_counts.items(), key=lambda x: x[1], reverse=True)[:5] + # Fetch artist names efficiently top_artists_objs = self.db.query(Artist).filter(Artist.id.in_([x[0] for x in top_artist_ids])).all() - artist_name_map = {a.id: a.name for a in top_artists_objs} + artist_map = {a.id: a.name for a in top_artists_objs} + top_artists = 
[{"name": artist_map.get(aid, "Unknown"), "count": c} for aid, c in top_artist_ids] - top_artists = [] - for aid, count in top_artist_ids: - top_artists.append({ - "name": artist_name_map.get(aid, "Unknown"), - "count": count - }) + top_genres = [{"name": k, "count": v} for k, v in + sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:5]] - # Top Genres - sorted_genres = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:5] - top_genres = [{"name": g, "count": c} for g, c in sorted_genres] + # Concentration (HHI & Gini) + # HHI: Sum of (share)^2 + shares = [c / total_plays for c in track_counts.values()] + hhi = sum([s ** 2 for s in shares]) - # Concentration - unique_tracks_count = len(unique_track_ids) - repeat_rate = (total_plays - unique_tracks_count) / total_plays if total_plays > 0 else 0 - - # HHI (Herfindahl–Hirschman Index) - # Sum of (share)^2. Share = track_plays / total_plays - hhi = sum([(c/total_plays)**2 for c in track_play_counts.values()]) + # Gini Coefficient (Inequality of play distribution) + sorted_shares = sorted(shares) + n = len(shares) + if n > 0: + gini = (2 * sum((i + 1) * x for i, x in enumerate(sorted_shares))) / (n * sum(sorted_shares)) - (n + 1) / n + else: + gini = 0 return { "total_plays": total_plays, - "estimated_minutes": int(estimated_minutes), - "unique_tracks": unique_tracks_count, - "unique_artists": len(unique_artist_ids), - "unique_albums": len(unique_album_ids), + "estimated_minutes": int(total_ms / 60000), + "unique_tracks": unique_tracks, + "unique_artists": len(artist_counts), + "unique_albums": len(album_ids), "unique_genres": len(genre_counts), "top_tracks": top_tracks, "top_artists": top_artists, "top_genres": top_genres, - "repeat_rate": round(repeat_rate, 3), + "repeat_rate": round((total_plays - unique_tracks) / total_plays, 3) if total_plays else 0, + "one_and_done_rate": round(one_and_done / unique_tracks, 3) if unique_tracks else 0, "concentration": { "hhi": round(hhi, 4), - # "gini": 
    def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """
        Time-of-day habits: hourly/weekday distributions, part-of-day buckets
        (morning/afternoon/evening/night), weekend share, and day streaks.

        Note: played_at is used as stored — presumably UTC; verify before
        interpreting peak_hour as local time.
        """
        query = self.db.query(PlayHistory).filter(
            PlayHistory.played_at >= period_start,
            PlayHistory.played_at <= period_end
        )
        plays = query.all()

        if not plays:
            return {}

        hourly_counts = [0] * 24
        weekday_counts = [0] * 7  # 0=Mon .. 6=Sun (datetime.weekday convention)
        part_of_day = {"morning": 0, "afternoon": 0, "evening": 0, "night": 0}

        # For Streaks
        active_dates = set()

        for p in plays:
            h = p.played_at.hour
            hourly_counts[h] += 1
            weekday_counts[p.played_at.weekday()] += 1
            active_dates.add(p.played_at.date())

            # Bucket boundaries: 05-11 morning, 12-16 afternoon,
            # 17-21 evening, everything else (22-04) night.
            if 5 <= h < 12:
                part_of_day["morning"] += 1
            elif 12 <= h < 17:
                part_of_day["afternoon"] += 1
            elif 17 <= h < 22:
                part_of_day["evening"] += 1
            else:
                part_of_day["night"] += 1

        # Calculate Streak
        sorted_dates = sorted(list(active_dates))
        current_streak = 0
        longest_streak = 0
        if sorted_dates:
            current_streak = 1
            longest_streak = 1
            # Check strictly consecutive days; a gap resets the run.
            for i in range(1, len(sorted_dates)):
                delta = (sorted_dates[i] - sorted_dates[i - 1]).days
                if delta == 1:
                    current_streak += 1
                else:
                    longest_streak = max(longest_streak, current_streak)
                    current_streak = 1
            # Account for a run that reaches the final date.
            longest_streak = max(longest_streak, current_streak)

        weekend_plays = weekday_counts[5] + weekday_counts[6]

        return {
            "hourly_distribution": hourly_counts,
            "peak_hour": hourly_counts.index(max(hourly_counts)),
            "weekday_distribution": weekday_counts,
            "weekend_share": round(weekend_plays / len(plays), 2),
            "part_of_day": part_of_day,
            # NOTE: "listening_streak" is the length of the most recent
            # consecutive run of active days, not necessarily ongoing today.
            "listening_streak": current_streak,
            "longest_streak": longest_streak,
            "active_days": len(active_dates)
        }
+ """ + # Need to join Track to get Energy features for Arc analysis + query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( + PlayHistory.played_at >= period_start, + PlayHistory.played_at <= period_end + ).order_by(PlayHistory.played_at.asc()) + plays = query.all() + + if not plays: + return {"count": 0} sessions = [] current_session = [plays[0]] + # 1. Sessionization (Gap > 20 mins) for i in range(1, len(plays)): - prev = plays[i-1] - curr = plays[i] - diff = (curr.played_at - prev.played_at).total_seconds() / 60 - + diff = (plays[i].played_at - plays[i-1].played_at).total_seconds() / 60 if diff > 20: sessions.append(current_session) current_session = [] - - current_session.append(curr) - + current_session.append(plays[i]) sessions.append(current_session) - session_lengths_min = [] - for sess in sessions: - if len(sess) > 1: - start = sess[0].played_at - end = sess[-1].played_at - # Add duration of last track? - # Let's just do (end - start) for simplicity + avg track duration - duration = (end - start).total_seconds() / 60 - session_lengths_min.append(duration) - else: - session_lengths_min.append(3.0) # Approx 1 track + # 2. 
Analyze Sessions + lengths_min = [] + micro_sessions = 0 + marathon_sessions = 0 + energy_arcs = {"rising": 0, "falling": 0, "flat": 0, "unknown": 0} - avg_min = sum(session_lengths_min) / len(session_lengths_min) if session_lengths_min else 0 + for sess in sessions: + # Durations + if len(sess) > 1: + duration = (sess[-1].played_at - sess[0].played_at).total_seconds() / 60 + lengths_min.append(duration) + else: + lengths_min.append(3.0) # Approx + + # Types + if len(sess) <= 3: micro_sessions += 1 + if len(sess) >= 20: marathon_sessions += 1 + + # Energy Arc (First vs Last track) + first_t = sess[0].track + last_t = sess[-1].track + if first_t and last_t and first_t.energy is not None and last_t.energy is not None: + diff = last_t.energy - first_t.energy + if diff > 0.1: energy_arcs["rising"] += 1 + elif diff < -0.1: energy_arcs["falling"] += 1 + else: energy_arcs["flat"] += 1 + else: + energy_arcs["unknown"] += 1 + + avg_min = sum(lengths_min) / len(lengths_min) if lengths_min else 0 return { "count": len(sessions), - "avg_tracks": len(plays) / len(sessions), + "avg_tracks": round(len(plays) / len(sessions), 1), "avg_minutes": round(avg_min, 1), - "longest_session_minutes": round(max(session_lengths_min), 1) if session_lengths_min else 0 + "longest_session_minutes": round(max(lengths_min), 1) if lengths_min else 0, + "micro_session_rate": round(micro_sessions / len(sessions), 2), + "marathon_session_rate": round(marathon_sessions / len(sessions), 2), + "energy_arcs": energy_arcs } def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Aggregates Audio Features (Energy, Valence, etc.) 
+ Aggregates Audio Features + Calculates Whiplash (Transitions) """ - query = self.db.query(PlayHistory).filter( + # Fetch plays strictly ordered by time for transition analysis + plays = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, PlayHistory.played_at <= period_end - ) - plays = query.all() - track_ids = list(set([p.track_id for p in plays])) + ).order_by(PlayHistory.played_at.asc()).all() - if not track_ids: + if not plays: return {} + track_ids = list(set([p.track_id for p in plays])) tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all() + track_map = {t.id: t for t in tracks} - # Collect features - features = { - "energy": [], "valence": [], "danceability": [], - "tempo": [], "acousticness": [], "instrumentalness": [], - "liveness": [], "speechiness": [] - } + # 1. Aggregates + features = {k: [] for k in + ["energy", "valence", "danceability", "tempo", "acousticness", "instrumentalness", "liveness", + "speechiness", "loudness"]} - for t in tracks: - # Weight by plays? The spec implies "Per-Period Aggregates". - # Usually weighted by play count is better representation of what was HEARD. - # Let's weight by play count in this period. - play_count = len([p for p in plays if p.track_id == t.id]) + # 2. 
Transition Arrays (for Whiplash) + transitions = {"tempo": [], "energy": [], "valence": []} + previous_track = None + + for i, p in enumerate(plays): + t = track_map.get(p.track_id) + if not t: continue + + # Populate aggregations if t.energy is not None: - for _ in range(play_count): - features["energy"].append(t.energy) - features["valence"].append(t.valence) - features["danceability"].append(t.danceability) - features["tempo"].append(t.tempo) - features["acousticness"].append(t.acousticness) - features["instrumentalness"].append(t.instrumentalness) - features["liveness"].append(t.liveness) - features["speechiness"].append(t.speechiness) + features["energy"].append(t.energy) + features["valence"].append(t.valence) + features["danceability"].append(t.danceability) + features["tempo"].append(t.tempo) + features["acousticness"].append(t.acousticness) + features["instrumentalness"].append(t.instrumentalness) + features["liveness"].append(t.liveness) + features["speechiness"].append(t.speechiness) + features["loudness"].append(t.loudness) + # Calculate Transitions (Whiplash) + if i > 0 and previous_track: + # Only count transition if within reasonable time (e.g. 
< 5 mins gap) + # assuming continuous listening + time_diff = (p.played_at - plays[i - 1].played_at).total_seconds() + if time_diff < 300: + if t.tempo and previous_track.tempo: + transitions["tempo"].append(abs(t.tempo - previous_track.tempo)) + if t.energy and previous_track.energy: + transitions["energy"].append(abs(t.energy - previous_track.energy)) + + previous_track = t + + # Calculate Stats stats = {} for key, values in features.items(): valid = [v for v in values if v is not None] @@ -282,46 +365,55 @@ class StatsService: stats[f"avg_{key}"] = None # Derived Metrics - if stats.get("avg_energy") and stats.get("avg_valence"): + if stats.get("avg_energy") is not None and stats.get("avg_valence") is not None: stats["mood_quadrant"] = { "x": round(stats["avg_valence"], 2), "y": round(stats["avg_energy"], 2) } + # Consistency: Inverse of average standard deviation of Mood components + avg_std = (stats["std_energy"] + stats["std_valence"]) / 2 + stats["consistency_score"] = round(1.0 - avg_std, 2) # Higher = more consistent + + # Whiplash Scores (Average jump between tracks) + stats["whiplash"] = {} + for k in ["tempo", "energy"]: + if transitions[k]: + stats["whiplash"][k] = round(float(np.mean(transitions[k])), 2) + else: + stats["whiplash"][k] = 0 return stats def compute_era_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Musical Age and Era Distribution. + Includes Nostalgia Gap and granular decade breakdown. 
""" - query = self.db.query(PlayHistory).filter( + # Join track to get raw_data + query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( PlayHistory.played_at >= period_start, PlayHistory.played_at <= period_end ) plays = query.all() years = [] - track_ids = list(set([p.track_id for p in plays])) - tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all() - track_map = {t.id: t for t in tracks} - for p in plays: - t = track_map.get(p.track_id) - if t and t.raw_data and "album" in t.raw_data and "release_date" in t.raw_data["album"]: - rd = t.raw_data["album"]["release_date"] - # Format can be YYYY, YYYY-MM, YYYY-MM-DD - try: - year = int(rd.split("-")[0]) - years.append(year) - except: - pass + t = p.track + if t and t.raw_data and "album" in t.raw_data: + rd = t.raw_data["album"].get("release_date") + if rd: + try: + years.append(int(rd.split("-")[0])) + except: + pass if not years: return {"musical_age": None} + # Musical Age (Weighted Average) avg_year = sum(years) / len(years) + current_year = datetime.utcnow().year - # Decade breakdown + # Decade Distribution decades = {} for y in years: dec = (y // 10) * 10 @@ -329,11 +421,13 @@ class StatsService: decades[label] = decades.get(label, 0) + 1 total = len(years) - decade_dist = {k: round(v/total, 2) for k, v in decades.items()} + dist = {k: round(v / total, 3) for k, v in decades.items()} return { "musical_age": int(avg_year), - "decade_distribution": decade_dist + "nostalgia_gap": int(current_year - avg_year), + "freshness_score": dist.get(f"{int(current_year / 10) * 10}s", 0), # Share of current decade + "decade_distribution": dist } def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: @@ -381,16 +475,191 @@ class StatsService: "skip_rate": round(skips / len(plays), 3) } - def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: + def compute_context_stats(self, period_start: datetime, 
period_end: datetime) -> Dict[str, Any]: + """ + Analyzes context_uri to determine if user listens to Playlists, Albums, or Artists. + """ + query = self.db.query(PlayHistory).filter( + PlayHistory.played_at >= period_start, + PlayHistory.played_at <= period_end + ) + plays = query.all() + + if not plays: + return {} + + context_counts = {"playlist": 0, "album": 0, "artist": 0, "collection": 0, "unknown": 0} + unique_contexts = {} + + for p in plays: + if not p.context_uri: + context_counts["unknown"] += 1 + continue + + # Count distinct contexts for loyalty + unique_contexts[p.context_uri] = unique_contexts.get(p.context_uri, 0) + 1 + + if "playlist" in p.context_uri: + context_counts["playlist"] += 1 + elif "album" in p.context_uri: + context_counts["album"] += 1 + elif "artist" in p.context_uri: + context_counts["artist"] += 1 + elif "collection" in p.context_uri: + # "Liked Songs" usually shows up as collection + context_counts["collection"] += 1 + else: + context_counts["unknown"] += 1 + + total = len(plays) + breakdown = {k: round(v / total, 2) for k, v in context_counts.items()} + + # Top 5 Contexts (Requires resolving URI to name, possibly missing metadata here) + sorted_contexts = sorted(unique_contexts.items(), key=lambda x: x[1], reverse=True)[:5] + return { - "period": { - "start": period_start.isoformat(), - "end": period_end.isoformat() - }, + "type_breakdown": breakdown, + "album_purist_score": breakdown.get("album", 0), + "playlist_dependency": breakdown.get("playlist", 0), + "context_loyalty": round(len(plays) / len(unique_contexts), 2) if unique_contexts else 0, + "top_context_uris": [{"uri": k, "count": v} for k, v in sorted_contexts] + } + + def compute_taste_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: + """ + Mainstream vs. Hipster analysis based on Track.popularity (0-100). 
+ """ + query = self.db.query(PlayHistory).filter( + PlayHistory.played_at >= period_start, + PlayHistory.played_at <= period_end + ) + plays = query.all() + if not plays: return {} + + track_ids = list(set([p.track_id for p in plays])) + tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all() + track_map = {t.id: t for t in tracks} + + pop_values = [] + for p in plays: + t = track_map.get(p.track_id) + if t and t.popularity is not None: + pop_values.append(t.popularity) + + if not pop_values: + return {"avg_popularity": 0, "hipster_score": 0} + + avg_pop = float(np.mean(pop_values)) + + # Hipster Score: Percentage of tracks with popularity < 30 + underground_plays = len([x for x in pop_values if x < 30]) + mainstream_plays = len([x for x in pop_values if x > 70]) + + return { + "avg_popularity": round(avg_pop, 1), + "hipster_score": round((underground_plays / len(pop_values)) * 100, 1), + "mainstream_score": round((mainstream_plays / len(pop_values)) * 100, 1), + "obscurity_rating": round(100 - avg_pop, 1) + } + + def compute_lifecycle_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: + """ + Determines if tracks are 'New Discoveries' or 'Old Favorites'. + """ + # 1. Get tracks played in this period + current_plays = self.db.query(PlayHistory).filter( + PlayHistory.played_at >= period_start, + PlayHistory.played_at <= period_end + ).all() + + if not current_plays: return {} + + current_track_ids = set([p.track_id for p in current_plays]) + + # 2. Check if these tracks were played BEFORE period_start + # We find which of the current_track_ids exist in history < period_start + old_tracks_query = self.db.query(distinct(PlayHistory.track_id)).filter( + PlayHistory.track_id.in_(current_track_ids), + PlayHistory.played_at < period_start + ) + old_track_ids = set([r[0] for r in old_tracks_query.all()]) + + # 3. 
Calculate Discovery + new_discoveries = current_track_ids - old_track_ids + discovery_count = len(new_discoveries) + + # Calculate plays on new discoveries + plays_on_new = len([p for p in current_plays if p.track_id in new_discoveries]) + total_plays = len(current_plays) + + return { + "discovery_count": discovery_count, + "discovery_rate": round(plays_on_new / total_plays, 3) if total_plays > 0 else 0, + "recurrence_rate": round((total_plays - plays_on_new) / total_plays, 3) if total_plays > 0 else 0 + } + + def compute_explicit_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: + """ + Analyzes explicit content consumption. + """ + query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( + PlayHistory.played_at >= period_start, + PlayHistory.played_at <= period_end + ) + plays = query.all() + + if not plays: return {"explicit_rate": 0, "hourly_explicit_rate": []} + + total_plays = len(plays) + explicit_count = 0 + hourly_explicit = [0] * 24 + hourly_total = [0] * 24 + + for p in plays: + h = p.played_at.hour + hourly_total[h] += 1 + + # Check raw_data for explicit flag + t = p.track + is_explicit = False + if t.raw_data and t.raw_data.get("explicit"): + is_explicit = True + + if is_explicit: + explicit_count += 1 + hourly_explicit[h] += 1 + + # Calculate hourly percentages + hourly_rates = [] + for i in range(24): + if hourly_total[i] > 0: + hourly_rates.append(round(hourly_explicit[i] / hourly_total[i], 2)) + else: + hourly_rates.append(0.0) + + return { + "explicit_rate": round(explicit_count / total_plays, 3), + "total_explicit_plays": explicit_count, + "hourly_explicit_distribution": hourly_rates + } + + def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: + # 1. 
    def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
        """
        Assemble the complete metrics payload for [period_start, period_end]:
        every sub-report keyed by section, plus a 'comparison' block with
        deltas against the immediately preceding window of equal length.
        """
        # 1. Calculate all current stats
        current_stats = {
            "period": {"start": period_start.isoformat(), "end": period_end.isoformat()},
            "volume": self.compute_volume_stats(period_start, period_end),
            "time_habits": self.compute_time_stats(period_start, period_end),
            "sessions": self.compute_session_stats(period_start, period_end),
            "context": self.compute_context_stats(period_start, period_end),
            "vibe": self.compute_vibe_stats(period_start, period_end),
            "era": self.compute_era_stats(period_start, period_end),
            "taste": self.compute_taste_stats(period_start, period_end),
            "lifecycle": self.compute_lifecycle_stats(period_start, period_end),
            "flags": self.compute_explicit_stats(period_start, period_end),
            "skips": self.compute_skip_stats(period_start, period_end)
        }

        # 2. Calculate Comparison (reads the stats computed above, so it must
        # run after the dict is fully built).
        current_stats["comparison"] = self.compute_comparison(current_stats, period_start, period_end)

        return current_stats