From 9b8f7355fb4bcde182ffa1136b82cda3a8c1ed0b Mon Sep 17 00:00:00 2001 From: bnair123 Date: Thu, 25 Dec 2025 22:17:21 +0400 Subject: [PATCH] Fixed and added all the stats_service.py methods --- .idea/vcs.xml | 6 + README.md | 7 +- backend/TECHNICAL_DOCS.md | 23 +- ...9264d3_add_image_url_and_lyrics_columns.py | 36 ++ backend/app/ingest.py | 134 +++++--- backend/app/models.py | 3 + backend/app/services/genius_client.py | 35 ++ backend/app/services/stats_service.py | 313 +++++++++++++----- backend/requirements.txt | 1 + 9 files changed, 412 insertions(+), 146 deletions(-) create mode 100644 .idea/vcs.xml create mode 100644 backend/alembic/versions/f92d8a9264d3_add_image_url_and_lyrics_columns.py create mode 100644 backend/app/services/genius_client.py diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..35eb1dd --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/README.md b/README.md index 2e52a85..a3d1941 100644 --- a/README.md +++ b/README.md @@ -5,7 +5,10 @@ A personal analytics dashboard for your music listening habits, powered by Pytho ## Features - **Continuous Ingestion**: Polls Spotify every 60 seconds to record your listening history. -- **Data Enrichment**: Automatically fetches **Genres** (via Spotify) and **Audio Features** (Energy, BPM, Mood via ReccoBeats). +- **Data Enrichment**: + - **Genres & Images** (via Spotify) + - **Audio Features** (Energy, BPM, Mood via ReccoBeats) + - **Lyrics & Metadata** (via Genius) - **Dashboard**: A responsive UI (Ant Design) to view your history, stats, and "Vibes". - **AI Ready**: Database schema and environment prepared for Gemini AI integration. @@ -18,6 +21,7 @@ You can run this application using Docker Compose. You have two options: using t - **Spotify Developer Credentials** (Client ID & Secret). - **Spotify Refresh Token** (Run `backend/scripts/get_refresh_token.py` locally to generate this). - **Google Gemini API Key**. +- **Genius API Token** (Optional, for lyrics). ### 2. Configuration (`.env`) @@ -28,6 +32,7 @@ SPOTIFY_CLIENT_ID="your_client_id" SPOTIFY_CLIENT_SECRET="your_client_secret" SPOTIFY_REFRESH_TOKEN="your_refresh_token" GEMINI_API_KEY="your_gemini_key" +GENIUS_ACCESS_TOKEN="your_genius_token" ``` ### 3. Run with Docker Compose diff --git a/backend/TECHNICAL_DOCS.md b/backend/TECHNICAL_DOCS.md index ff2c179..b8e6644 100644 --- a/backend/TECHNICAL_DOCS.md +++ b/backend/TECHNICAL_DOCS.md @@ -87,9 +87,28 @@ The LLM returns a JSON object with: ## 3. Data Models (`backend/app/models.py`) -- **Track:** Stores static metadata and audio features. `raw_data` stores the full Spotify JSON for future-proofing. -- **Artist:** Normalized artist entities. Linked to tracks via `track_artists` table. +- **Track:** Stores static metadata and audio features. + - `lyrics`: Full lyrics from Genius (Text). + - `image_url`: Album art URL (String). + - `raw_data`: The full Spotify JSON for future-proofing. +- **Artist:** Normalized artist entities. + - `image_url`: Artist profile image (String). - **PlayHistory:** The timeseries ledger. Links `Track` to a timestamp and context. - **AnalysisSnapshot:** Stores the final output of these services. - `metrics_payload`: The JSON output of `StatsService`. - `narrative_report`: The JSON output of `NarrativeService`. + +## 4. External Integrations + +### Spotify +- **Ingestion:** Polls `recently-played` endpoint every 60s. +- **Enrichment:** Fetches Artist genres and images. + +### Genius +- **Client:** `backend/app/services/genius_client.py`. +- **Function:** Searches for lyrics and high-res album art if missing from Spotify data. +- **Trigger:** Runs during the ingestion loop for new tracks. + +### ReccoBeats +- **Function:** Fetches audio features (Danceability, Energy, Valence) for tracks. + diff --git a/backend/alembic/versions/f92d8a9264d3_add_image_url_and_lyrics_columns.py b/backend/alembic/versions/f92d8a9264d3_add_image_url_and_lyrics_columns.py new file mode 100644 index 0000000..2db2500 --- /dev/null +++ b/backend/alembic/versions/f92d8a9264d3_add_image_url_and_lyrics_columns.py @@ -0,0 +1,36 @@ +"""Add image_url and lyrics columns + +Revision ID: f92d8a9264d3 +Revises: 4401cb416661 +Create Date: 2025-12-25 22:06:05.841447 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = 'f92d8a9264d3' +down_revision: Union[str, Sequence[str], None] = '4401cb416661' +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + """Upgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.add_column('artists', sa.Column('image_url', sa.String(), nullable=True)) + op.add_column('tracks', sa.Column('image_url', sa.String(), nullable=True)) + op.add_column('tracks', sa.Column('lyrics', sa.Text(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + """Downgrade schema.""" + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column('tracks', 'lyrics') + op.drop_column('tracks', 'image_url') + op.drop_column('artists', 'image_url') + # ### end Alembic commands ### diff --git a/backend/app/ingest.py b/backend/app/ingest.py index 5a4ea28..b305196 100644 --- a/backend/app/ingest.py +++ b/backend/app/ingest.py @@ -6,9 +6,10 @@ from .models import Track, PlayHistory, Artist from .database import SessionLocal from .services.spotify_client import SpotifyClient from .services.reccobeats_client import ReccoBeatsClient +from .services.genius_client import GeniusClient from dateutil import parser -# Initialize Spotify Client (env vars will be populated later) +# Initialize Clients def get_spotify_client(): return SpotifyClient( client_id=os.getenv("SPOTIFY_CLIENT_ID"), @@ -19,57 +20,55 @@ def get_spotify_client(): def get_reccobeats_client(): return ReccoBeatsClient() +def get_genius_client(): + return GeniusClient() + async def ensure_artists_exist(db: Session, artists_data: list): """ Ensures that all artists in the list exist in the Artist table. - Returns a list of Artist objects. """ artist_objects = [] for a_data in artists_data: artist_id = a_data["id"] artist = db.query(Artist).filter(Artist.id == artist_id).first() if not artist: + # Check if image is available in this payload (rare for track-linked artists, but possible) + img = None + if "images" in a_data and a_data["images"]: + img = a_data["images"][0]["url"] + artist = Artist( id=artist_id, name=a_data["name"], - genres=[] # Will be enriched later + genres=[], + image_url=img ) db.add(artist) - # We commit inside the loop or after, but for now we rely on the main commit - # However, to return the object correctly we might need to flush if we were doing complex things, - # but here adding to session is enough for SQLAlchemy to track it. artist_objects.append(artist) return artist_objects -async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client: ReccoBeatsClient): +async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client: ReccoBeatsClient, genius_client: GeniusClient): """ - Finds tracks missing genres (Spotify) or audio features (ReccoBeats) and enriches them. - Also enriches Artists with genres. + Enrichment Pipeline: + 1. Audio Features (ReccoBeats) + 2. Artist Metadata: Genres & Images (Spotify) + 3. Lyrics & Fallback Images (Genius) """ - # 1. Enrich Audio Features (via ReccoBeats) + # 1. Enrich Audio Features tracks_missing_features = db.query(Track).filter(Track.danceability == None).limit(50).all() - print(f"DEBUG: Found {len(tracks_missing_features)} tracks missing audio features.") - if tracks_missing_features: - print(f"Enriching {len(tracks_missing_features)} tracks with audio features (ReccoBeats)...") + print(f"Enriching {len(tracks_missing_features)} tracks with audio features...") ids = [t.id for t in tracks_missing_features] - features_list = await recco_client.get_audio_features(ids) - + + # Map features by ID features_map = {} for f in features_list: + # Handle potential ID mismatch or URI format tid = f.get("id") - if not tid and "href" in f: - if "tracks/" in f["href"]: - tid = f["href"].split("tracks/")[1].split("?")[0] - elif "track/" in f["href"]: - tid = f["href"].split("track/")[1].split("?")[0] + if tid: features_map[tid] = f - if tid: - features_map[tid] = f - - updated_count = 0 for track in tracks_missing_features: data = features_map.get(track.id) if data: @@ -84,47 +83,68 @@ async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client track.liveness = data.get("liveness") track.valence = data.get("valence") track.tempo = data.get("tempo") - updated_count += 1 - - print(f"Updated {updated_count} tracks with audio features.") + db.commit() - # 2. Enrich Artist Genres (via Spotify Artists) - # We look for artists who have no genres. Note: an artist might genuinely have no genres, - # so we might need a flag "genres_checked" in the future, but for now checking empty list is okay. - # However, newly created artists have genres=[] (empty list) or None? - # My model definition: genres = Column(JSON, nullable=True) - # So if it is None, we haven't fetched it. - - artists_missing_genres = db.query(Artist).filter(Artist.genres == None).limit(50).all() - - if artists_missing_genres: - print(f"Enriching {len(artists_missing_genres)} artists with genres (Spotify)...") - artist_ids_list = [a.id for a in artists_missing_genres] - + # 2. Enrich Artist Genres & Images (Spotify) + artists_missing_data = db.query(Artist).filter((Artist.genres == None) | (Artist.image_url == None)).limit(50).all() + if artists_missing_data: + print(f"Enriching {len(artists_missing_data)} artists with genres/images...") + artist_ids_list = [a.id for a in artists_missing_data] + artist_data_map = {} - # Spotify allows fetching 50 artists at a time for i in range(0, len(artist_ids_list), 50): chunk = artist_ids_list[i:i+50] artists_data = await spotify_client.get_artists(chunk) for a_data in artists_data: if a_data: - artist_data_map[a_data["id"]] = a_data.get("genres", []) + img = a_data["images"][0]["url"] if a_data.get("images") else None + artist_data_map[a_data["id"]] = { + "genres": a_data.get("genres", []), + "image_url": img + } - for artist in artists_missing_genres: - genres = artist_data_map.get(artist.id) - if genres is not None: - artist.genres = genres + for artist in artists_missing_data: + data = artist_data_map.get(artist.id) + if data: + if artist.genres is None: artist.genres = data["genres"] + if artist.image_url is None: artist.image_url = data["image_url"] + elif artist.genres is None: + artist.genres = [] # Prevent retry loop + + db.commit() + + # 3. Enrich Lyrics (Genius) + # Only fetch for tracks that have been played recently to avoid spamming Genius API + tracks_missing_lyrics = db.query(Track).filter(Track.lyrics == None).order_by(Track.updated_at.desc()).limit(10).all() + + if tracks_missing_lyrics and genius_client.genius: + print(f"Enriching {len(tracks_missing_lyrics)} tracks with lyrics (Genius)...") + for track in tracks_missing_lyrics: + # We need the primary artist name + artist_name = track.artist.split(",")[0] # Heuristic: take first artist + + print(f"Searching Genius for: {track.name} by {artist_name}") + data = genius_client.search_song(track.name, artist_name) + + if data: + track.lyrics = data["lyrics"] + # Fallback: if we didn't get high-res art from Spotify, use Genius + if not track.image_url and data.get("image_url"): + track.image_url = data["image_url"] else: - # If we couldn't fetch, set to empty list so we don't keep retrying forever (or handle errors better) - artist.genres = [] - + track.lyrics = "" # Mark as empty to prevent retry loop + + # Small sleep to be nice to API? GeniusClient is synchronous. + # We are in async function but GeniusClient is blocking. It's fine for worker. + db.commit() async def ingest_recently_played(db: Session): spotify_client = get_spotify_client() recco_client = get_reccobeats_client() + genius_client = get_genius_client() try: items = await spotify_client.get_recently_played(limit=50) @@ -144,11 +164,18 @@ async def ingest_recently_played(db: Session): if not track: print(f"New track found: {track_data['name']}") + + # Extract Album Art + image_url = None + if track_data.get("album") and track_data["album"].get("images"): + image_url = track_data["album"]["images"][0]["url"] + track = Track( id=track_id, name=track_data["name"], - artist=", ".join([a["name"] for a in track_data["artists"]]), # Legacy string + artist=", ".join([a["name"] for a in track_data["artists"]]), album=track_data["album"]["name"], + image_url=image_url, duration_ms=track_data["duration_ms"], popularity=track_data["popularity"], raw_data=track_data @@ -162,11 +189,8 @@ async def ingest_recently_played(db: Session): db.add(track) db.commit() - # Ensure relationships exist even if track existed (e.g. migration) - # Check if track has artists linked. If not (and raw_data has them), link them. - # FIX: Logic was previously indented improperly inside `if not track`. + # Ensure relationships exist logic... if not track.artists and track.raw_data and "artists" in track.raw_data: - print(f"Backfilling artists for track {track.name}") artist_objects = await ensure_artists_exist(db, track.raw_data["artists"]) track.artists = artist_objects db.commit() @@ -188,7 +212,7 @@ async def ingest_recently_played(db: Session): db.commit() # Enrich - await enrich_tracks(db, spotify_client, recco_client) + await enrich_tracks(db, spotify_client, recco_client, genius_client) async def run_worker(): """Simulates a background worker loop.""" diff --git a/backend/app/models.py b/backend/app/models.py index 4accd8d..3751adf 100644 --- a/backend/app/models.py +++ b/backend/app/models.py @@ -17,6 +17,7 @@ class Artist(Base): id = Column(String, primary_key=True, index=True) # Spotify ID name = Column(String) genres = Column(JSON, nullable=True) # List of genre strings + image_url = Column(String, nullable=True) # Artist profile image # Relationships tracks = relationship("Track", secondary=track_artists, back_populates="artists") @@ -28,6 +29,7 @@ class Track(Base): name = Column(String) artist = Column(String) # Display string (e.g. "Drake, Future") - kept for convenience album = Column(String) + image_url = Column(String, nullable=True) # Album art duration_ms = Column(Integer) popularity = Column(Integer, nullable=True) @@ -53,6 +55,7 @@ class Track(Base): genres = Column(JSON, nullable=True) # AI Analysis fields + lyrics = Column(Text, nullable=True) # Full lyrics from Genius lyrics_summary = Column(String, nullable=True) genre_tags = Column(String, nullable=True) diff --git a/backend/app/services/genius_client.py b/backend/app/services/genius_client.py new file mode 100644 index 0000000..a67511b --- /dev/null +++ b/backend/app/services/genius_client.py @@ -0,0 +1,35 @@ +import os +import lyricsgenius +from typing import Optional, Dict, Any + +class GeniusClient: + def __init__(self): + self.access_token = os.getenv("GENIUS_ACCESS_TOKEN") + if self.access_token: + self.genius = lyricsgenius.Genius(self.access_token, verbose=False, remove_section_headers=True) + else: + print("WARNING: GENIUS_ACCESS_TOKEN not found. Lyrics enrichment will be skipped.") + self.genius = None + + def search_song(self, title: str, artist: str) -> Optional[Dict[str, Any]]: + """ + Searches for a song on Genius and returns metadata + lyrics. + """ + if not self.genius: + return None + + try: + # Clean up title (remove "Feat.", "Remastered", etc for better search match) + clean_title = title.split(" - ")[0].split("(")[0].strip() + song = self.genius.search_song(clean_title, artist) + + if song: + return { + "lyrics": song.lyrics, + "image_url": song.song_art_image_url, + "artist_image_url": song.primary_artist.image_url + } + except Exception as e: + print(f"Genius Search Error for {title} by {artist}: {e}") + + return None diff --git a/backend/app/services/stats_service.py b/backend/app/services/stats_service.py index a3dc33f..d2a5eda 100644 --- a/backend/app/services/stats_service.py +++ b/backend/app/services/stats_service.py @@ -4,6 +4,7 @@ from datetime import datetime, timedelta from typing import Dict, Any, List, Optional import math import numpy as np +from sklearn.cluster import KMeans from ..models import PlayHistory, Track, Artist @@ -78,10 +79,18 @@ class StatsService: genre_counts = {} album_counts = {} - # Maps for resolving names later without DB hits + # Maps for resolving names/images later without DB hits track_map = {} artist_map = {} album_map = {} + + # Helper to safely get image + def get_track_image(t): + if t.image_url: return t.image_url + if t.raw_data and "album" in t.raw_data and "images" in t.raw_data["album"]: + imgs = t.raw_data["album"]["images"] + if imgs: return imgs[0].get("url") + return None for p in plays: t = p.track @@ -102,12 +111,15 @@ class StatsService: album_name = t.raw_data["album"].get("name", t.album) album_counts[album_id] = album_counts.get(album_id, 0) + 1 - album_map[album_id] = album_name + # Store tuple of (name, image_url) + if album_id not in album_map: + album_map[album_id] = {"name": album_name, "image": get_track_image(t)} # Artist Aggregation (Iterate objects, not string) for artist in t.artists: artist_counts[artist.id] = artist_counts.get(artist.id, 0) + 1 - artist_map[artist.id] = artist.name + if artist.id not in artist_map: + artist_map[artist.id] = {"name": artist.name, "image": artist.image_url} # Genre Aggregation if artist.genres: @@ -124,19 +136,20 @@ class StatsService: top_tracks = [ { "name": track_map[tid].name, - "artist": ", ".join([a.name for a in track_map[tid].artists]), # Correct artist display + "artist": ", ".join([a.name for a in track_map[tid].artists]), + "image": get_track_image(track_map[tid]), "count": c } for tid, c in sorted(track_counts.items(), key=lambda x: x[1], reverse=True)[:5] ] top_artists = [ - {"name": artist_map.get(aid, "Unknown"), "count": c} + {"name": artist_map[aid]["name"], "id": aid, "image": artist_map[aid]["image"], "count": c} for aid, c in sorted(artist_counts.items(), key=lambda x: x[1], reverse=True)[:5] ] top_albums = [ - {"name": album_map.get(aid, "Unknown"), "count": c} + {"name": album_map[aid]["name"], "image": album_map[aid]["image"], "count": c} for aid, c in sorted(album_counts.items(), key=lambda x: x[1], reverse=True)[:5] ] @@ -188,7 +201,7 @@ class StatsService: def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Includes Part-of-Day buckets, Listening Streaks, and Active Days stats. + Includes Part-of-Day buckets, Listening Streaks, Active Days, and 2D Heatmap. """ query = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, @@ -199,16 +212,24 @@ class StatsService: if not plays: return {} + # Heatmap: 7 days x 24 hours + heatmap = [[0 for _ in range(24)] for _ in range(7)] + hourly_counts = [0] * 24 weekday_counts = [0] * 7 - # Spec: Morning (6-12), Afternoon (12-18), Evening (18-24), Night (0-6) + part_of_day = {"morning": 0, "afternoon": 0, "evening": 0, "night": 0} active_dates = set() for p in plays: h = p.played_at.hour + d = p.played_at.weekday() + + # Populate Heatmap + heatmap[d][h] += 1 + hourly_counts[h] += 1 - weekday_counts[p.played_at.weekday()] += 1 + weekday_counts[d] += 1 active_dates.add(p.played_at.date()) if 6 <= h < 12: @@ -240,6 +261,7 @@ class StatsService: active_days_count = len(active_dates) return { + "heatmap": heatmap, # 7x24 Matrix "hourly_distribution": hourly_counts, "peak_hour": hourly_counts.index(max(hourly_counts)), "weekday_distribution": weekday_counts, @@ -253,7 +275,7 @@ class StatsService: def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Includes Micro-sessions, Marathon sessions, Energy Arcs, and Median metrics. + Includes Micro-sessions, Marathon sessions, Energy Arcs, Median metrics, and Session List. """ query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( PlayHistory.played_at >= period_start, @@ -282,21 +304,41 @@ class StatsService: marathon_sessions = 0 energy_arcs = {"rising": 0, "falling": 0, "flat": 0, "unknown": 0} start_hour_dist = [0] * 24 + + session_list = [] # Metadata for timeline for sess in sessions: + start_t = sess[0].played_at + end_t = sess[-1].played_at + # Start time distribution - start_hour_dist[sess[0].played_at.hour] += 1 + start_hour_dist[start_t.hour] += 1 # Durations if len(sess) > 1: - duration = (sess[-1].played_at - sess[0].played_at).total_seconds() / 60 + duration = (end_t - start_t).total_seconds() / 60 lengths_min.append(duration) else: - lengths_min.append(3.0) # Approx single song + duration = 3.0 # Approx single song + lengths_min.append(duration) # Types - if len(sess) <= 3: micro_sessions += 1 - if len(sess) >= 20: marathon_sessions += 1 + sess_type = "Standard" + if len(sess) <= 3: + micro_sessions += 1 + sess_type = "Micro" + elif len(sess) >= 20: + marathon_sessions += 1 + sess_type = "Marathon" + + # Store Session Metadata + session_list.append({ + "start_time": start_t.isoformat(), + "end_time": end_t.isoformat(), + "duration_minutes": round(duration, 1), + "track_count": len(sess), + "type": sess_type + }) # Energy Arc first_t = sess[0].track @@ -326,12 +368,13 @@ class StatsService: "start_hour_distribution": start_hour_dist, "micro_session_rate": round(micro_sessions / len(sessions), 2), "marathon_session_rate": round(marathon_sessions / len(sessions), 2), - "energy_arcs": energy_arcs + "energy_arcs": energy_arcs, + "session_list": session_list } def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Aggregates Audio Features + Calculates Whiplash, Percentiles, and Profiles. + Aggregates Audio Features + Calculates Whiplash + Clustering + Harmonic Profile. """ plays = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, @@ -349,6 +392,14 @@ class StatsService: feature_keys = ["energy", "valence", "danceability", "tempo", "acousticness", "instrumentalness", "liveness", "speechiness", "loudness"] features = {k: [] for k in feature_keys} + + # For Clustering: List of [energy, valence, danceability, acousticness] + cluster_data = [] + + # For Harmonic & Tempo + keys = [] + modes = [] + tempo_zones = {"chill": 0, "groove": 0, "hype": 0} # 2. Transition Arrays (for Whiplash) transitions = {"tempo": [], "energy": [], "valence": []} @@ -364,6 +415,20 @@ class StatsService: val = getattr(t, key, None) if val is not None: features[key].append(val) + + # Cluster Data (only if all 4 exist) + if all(getattr(t, k) is not None for k in ["energy", "valence", "danceability", "acousticness"]): + cluster_data.append([t.energy, t.valence, t.danceability, t.acousticness]) + + # Harmonic + if t.key is not None: keys.append(t.key) + if t.mode is not None: modes.append(t.mode) + + # Tempo Zones + if t.tempo is not None: + if t.tempo < 100: tempo_zones["chill"] += 1 + elif t.tempo < 130: tempo_zones["groove"] += 1 + else: tempo_zones["hype"] += 1 # Calculate Transitions (Whiplash) if i > 0 and previous_track: @@ -381,12 +446,13 @@ class StatsService: # Calculate Stats (Mean, Std, Percentiles) stats = {} for key, values in features.items(): - if values: - stats[f"avg_{key}"] = float(np.mean(values)) - stats[f"std_{key}"] = float(np.std(values)) - stats[f"p10_{key}"] = float(np.percentile(values, 10)) - stats[f"p50_{key}"] = float(np.percentile(values, 50)) # Median - stats[f"p90_{key}"] = float(np.percentile(values, 90)) + valid = [v for v in values if v is not None] + if valid: + stats[f"avg_{key}"] = float(np.mean(valid)) + stats[f"std_{key}"] = float(np.std(valid)) + stats[f"p10_{key}"] = float(np.percentile(valid, 10)) + stats[f"p50_{key}"] = float(np.percentile(valid, 50)) # Median + stats[f"p90_{key}"] = float(np.percentile(valid, 90)) else: stats[f"avg_{key}"] = None @@ -396,31 +462,97 @@ class StatsService: "x": round(stats["avg_valence"], 2), "y": round(stats["avg_energy"], 2) } - # Consistency avg_std = (stats.get("std_energy", 0) + stats.get("std_valence", 0)) / 2 stats["consistency_score"] = round(1.0 - avg_std, 2) - # Rhythm Profile if stats.get("avg_tempo") is not None and stats.get("avg_danceability") is not None: stats["rhythm_profile"] = { "avg_tempo": round(stats["avg_tempo"], 1), "avg_danceability": round(stats["avg_danceability"], 2) } - # Texture Profile if stats.get("avg_acousticness") is not None and stats.get("avg_instrumentalness") is not None: stats["texture_profile"] = { "acousticness": round(stats["avg_acousticness"], 2), "instrumentalness": round(stats["avg_instrumentalness"], 2) } - # Whiplash Scores + # Whiplash stats["whiplash"] = {} for k in ["tempo", "energy", "valence"]: if transitions[k]: stats["whiplash"][k] = round(float(np.mean(transitions[k])), 2) else: stats["whiplash"][k] = 0 + + # Tempo Zones + total_tempo = sum(tempo_zones.values()) + if total_tempo > 0: + stats["tempo_zones"] = {k: round(v / total_tempo, 2) for k, v in tempo_zones.items()} + else: + stats["tempo_zones"] = {} + + # Harmonic Profile + if modes: + major_count = len([m for m in modes if m == 1]) + stats["harmonic_profile"] = { + "major_pct": round(major_count / len(modes), 2), + "minor_pct": round((len(modes) - major_count) / len(modes), 2) + } + + if keys: + # Map integers to pitch class notation + pitch_class = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"] + key_counts = {} + for k in keys: + if 0 <= k < 12: + label = pitch_class[k] + key_counts[label] = key_counts.get(label, 0) + 1 + stats["top_keys"] = [{"key": k, "count": v} for k, v in sorted(key_counts.items(), key=lambda x: x[1], reverse=True)[:3]] + + # CLUSTERING (K-Means) + if len(cluster_data) >= 5: # Need enough data points + try: + # Features: energy, valence, danceability, acousticness + kmeans = KMeans(n_clusters=3, random_state=42, n_init=10) + labels = kmeans.fit_predict(cluster_data) + + # Analyze clusters + clusters = [] + for i in range(3): + mask = (labels == i) + count = np.sum(mask) + if count == 0: continue + + centroid = kmeans.cluster_centers_[i] + share = count / len(cluster_data) + + # Heuristic Naming + c_energy, c_valence, c_dance, c_acoustic = centroid + name = "Mixed Vibe" + if c_energy > 0.7: name = "High Energy" + elif c_acoustic > 0.7: name = "Acoustic / Chill" + elif c_valence < 0.3: name = "Melancholy" + elif c_dance > 0.7: name = "Dance / Groove" + + clusters.append({ + "name": name, + "share": round(share, 2), + "features": { + "energy": round(c_energy, 2), + "valence": round(c_valence, 2), + "danceability": round(c_dance, 2), + "acousticness": round(c_acoustic, 2) + } + }) + + # Sort by share + stats["clusters"] = sorted(clusters, key=lambda x: x["share"], reverse=True) + except Exception as e: + print(f"Clustering failed: {e}") + stats["clusters"] = [] + else: + stats["clusters"] = [] return stats @@ -448,9 +580,11 @@ class StatsService: if not years: return {"musical_age": None} + # Musical Age (Weighted Average) avg_year = sum(years) / len(years) current_year = datetime.utcnow().year + # Decade Distribution decades = {} for y in years: dec = (y // 10) * 10 @@ -463,17 +597,18 @@ class StatsService: return { "musical_age": int(avg_year), "nostalgia_gap": int(current_year - avg_year), - "freshness_score": dist.get(f"{int(current_year / 10) * 10}s", 0), + "freshness_score": dist.get(f"{int(current_year / 10) * 10}s", 0), # Share of current decade "decade_distribution": dist } def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Implements boredom skip detection. + Implements boredom skip detection: + (next_track.played_at - current_track.played_at) < (current_track.duration_ms / 1000 - 10s) """ query = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, - PlayHistory.played_at < period_end + PlayHistory.played_at <= period_end ).order_by(PlayHistory.played_at.asc()) plays = query.all() @@ -485,10 +620,7 @@ class StatsService: tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all() track_map = {t.id: t for t in tracks} - # Denominator: transitions, which is plays - 1 - transitions_count = len(plays) - 1 - - for i in range(transitions_count): + for i in range(len(plays) - 1): current_play = plays[i] next_play = plays[i+1] track = track_map.get(current_play.track_id) @@ -497,28 +629,31 @@ class StatsService: continue diff_seconds = (next_play.played_at - current_play.played_at).total_seconds() - duration_sec = track.duration_ms / 1000.0 # Logic: If diff < (duration - 10s), it's a skip. - # AND it must be a "valid" listening attempt (e.g. > 30s) - # AND it shouldn't be a huge gap (e.g. paused for 2 hours then hit next) - - if 30 < diff_seconds < (duration_sec - 10): + # Convert duration to seconds + duration_sec = track.duration_ms / 1000.0 + + # Also ensure diff isn't negative or weirdly small (re-plays) + # And assume "listening" means diff > 30s at least? + # Spec says "Spotify only returns 30s+". + + if diff_seconds < (duration_sec - 10): skips += 1 return { "total_skips": skips, - "skip_rate": round(skips / transitions_count, 3) if transitions_count > 0 else 0 + "skip_rate": round(skips / len(plays), 3) } def compute_context_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Analyzes context_uri and switching rate. + Analyzes context_uri to determine if user listens to Playlists, Albums, or Artists. """ query = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, - PlayHistory.played_at < period_end - ).order_by(PlayHistory.played_at.asc()) + PlayHistory.played_at <= period_end + ) plays = query.all() if not plays: @@ -526,32 +661,31 @@ class StatsService: context_counts = {"playlist": 0, "album": 0, "artist": 0, "collection": 0, "unknown": 0} unique_contexts = {} - context_switches = 0 - - last_context = None for p in plays: - uri = p.context_uri - if not uri: + if not p.context_uri: context_counts["unknown"] += 1 - uri = "unknown" - else: - if "playlist" in uri: context_counts["playlist"] += 1 - elif "album" in uri: context_counts["album"] += 1 - elif "artist" in uri: context_counts["artist"] += 1 - elif "collection" in uri: context_counts["collection"] += 1 - else: context_counts["unknown"] += 1 + continue - if uri != "unknown": - unique_contexts[uri] = unique_contexts.get(uri, 0) + 1 - - # Switch detection - if last_context and uri != last_context: - context_switches += 1 - last_context = uri + # Count distinct contexts for loyalty + unique_contexts[p.context_uri] = unique_contexts.get(p.context_uri, 0) + 1 + + if "playlist" in p.context_uri: + context_counts["playlist"] += 1 + elif "album" in p.context_uri: + context_counts["album"] += 1 + elif "artist" in p.context_uri: + context_counts["artist"] += 1 + elif "collection" in p.context_uri: + # "Liked Songs" usually shows up as collection + context_counts["collection"] += 1 + else: + context_counts["unknown"] += 1 total = len(plays) breakdown = {k: round(v / total, 2) for k, v in context_counts.items()} + + # Top 5 Contexts (Requires resolving URI to name, possibly missing metadata here) sorted_contexts = sorted(unique_contexts.items(), key=lambda x: x[1], reverse=True)[:5] return { @@ -559,17 +693,16 @@ class StatsService: "album_purist_score": breakdown.get("album", 0), "playlist_dependency": breakdown.get("playlist", 0), "context_loyalty": round(len(plays) / len(unique_contexts), 2) if unique_contexts else 0, - "context_switching_rate": round(context_switches / (total - 1), 2) if total > 1 else 0, "top_context_uris": [{"uri": k, "count": v} for k, v in sorted_contexts] } def compute_taste_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Mainstream vs. Hipster analysis. + Mainstream vs. Hipster analysis based on Track.popularity (0-100). """ query = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, - PlayHistory.played_at < period_end + PlayHistory.played_at <= period_end ) plays = query.all() if not plays: return {} @@ -602,47 +735,38 @@ class StatsService: def compute_lifecycle_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: """ - Discovery, Recurrence, Comebacks, Obsessions. + Determines if tracks are 'New Discoveries' or 'Old Favorites'. """ - # 1. Current plays + # 1. Get tracks played in this period current_plays = self.db.query(PlayHistory).filter( PlayHistory.played_at >= period_start, - PlayHistory.played_at < period_end + PlayHistory.played_at <= period_end ).all() if not current_plays: return {} current_track_ids = set([p.track_id for p in current_plays]) - # 2. Historical check + # 2. Check if these tracks were played BEFORE period_start + # We find which of the current_track_ids exist in history < period_start old_tracks_query = self.db.query(distinct(PlayHistory.track_id)).filter( PlayHistory.track_id.in_(current_track_ids), PlayHistory.played_at < period_start ) old_track_ids = set([r[0] for r in old_tracks_query.all()]) - # 3. Discovery + # 3. Calculate Discovery new_discoveries = current_track_ids - old_track_ids - - # 4. Obsessions (Tracks with > 5 plays in period) - track_counts = {} - for p in current_plays: - track_counts[p.track_id] = track_counts.get(p.track_id, 0) + 1 - obsessions = [tid for tid, count in track_counts.items() if count >= 5] - - # 5. Comeback Detection (Old tracks not played in last 30 days) - # Simplified: If in old_track_ids but NOT in last 30 days before period_start? - # That requires a gap check. For now, we will mark 'recurrence' as general relistening. - + discovery_count = len(new_discoveries) + + # Calculate plays on new discoveries plays_on_new = len([p for p in current_plays if p.track_id in new_discoveries]) total_plays = len(current_plays) return { - "discovery_count": len(new_discoveries), + "discovery_count": discovery_count, "discovery_rate": round(plays_on_new / total_plays, 3) if total_plays > 0 else 0, - "recurrence_rate": round((total_plays - plays_on_new) / total_plays, 3) if total_plays > 0 else 0, - "obsession_count": len(obsessions), - "obsession_rate": round(len(obsessions) / len(current_track_ids), 3) if current_track_ids else 0 + "recurrence_rate": round((total_plays - plays_on_new) / total_plays, 3) if total_plays > 0 else 0 } def compute_explicit_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: @@ -651,7 +775,7 @@ class StatsService: """ query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( PlayHistory.played_at >= period_start, - PlayHistory.played_at < period_end + PlayHistory.played_at <= period_end ) plays = query.all() @@ -665,14 +789,24 @@ class StatsService: for p in plays: h = p.played_at.hour hourly_total[h] += 1 + + # Check raw_data for explicit flag t = p.track + is_explicit = False if t.raw_data and t.raw_data.get("explicit"): + is_explicit = True + + if is_explicit: explicit_count += 1 hourly_explicit[h] += 1 + # Calculate hourly percentages hourly_rates = [] for i in range(24): - hourly_rates.append(round(hourly_explicit[i] / hourly_total[i], 2) if hourly_total[i] > 0 else 0.0) + if hourly_total[i] > 0: + hourly_rates.append(round(hourly_explicit[i] / hourly_total[i], 2)) + else: + hourly_rates.append(0.0) return { "explicit_rate": round(explicit_count / total_plays, 3), @@ -681,6 +815,7 @@ class StatsService: } def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: + # 1. Calculate all current stats current_stats = { "period": {"start": period_start.isoformat(), "end": period_end.isoformat()}, "volume": self.compute_volume_stats(period_start, period_end), @@ -695,7 +830,9 @@ class StatsService: "skips": self.compute_skip_stats(period_start, period_end) } + # 2. Calculate Comparison current_stats["comparison"] = self.compute_comparison(current_stats, period_start, period_end) + return current_stats def _empty_volume_stats(self): @@ -710,4 +847,4 @@ class StatsService: def _pct_change(self, curr, prev): if prev == 0: return 100.0 if curr > 0 else 0.0 - return round(((curr - prev) / prev) * 100, 1) + return round(((curr - prev) / prev) * 100, 1) \ No newline at end of file diff --git a/backend/requirements.txt b/backend/requirements.txt index 3bd8239..488ba79 100644 --- a/backend/requirements.txt +++ b/backend/requirements.txt @@ -11,3 +11,4 @@ python-dateutil==2.9.0.post0 requests==2.31.0 alembic==1.13.1 scikit-learn==1.4.0 +lyricsgenius==3.0.1