Fixed and added all the stats_service.py methods

This commit is contained in:
bnair123
2025-12-25 22:17:21 +04:00
parent e7980cc706
commit 9b8f7355fb
9 changed files with 412 additions and 146 deletions

6
.idea/vcs.xml generated Normal file
View File

@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<project version="4">
<component name="VcsDirectoryMappings">
<mapping directory="" vcs="Git" />
</component>
</project>

View File

@@ -5,7 +5,10 @@ A personal analytics dashboard for your music listening habits, powered by Pytho
## Features ## Features
- **Continuous Ingestion**: Polls Spotify every 60 seconds to record your listening history. - **Continuous Ingestion**: Polls Spotify every 60 seconds to record your listening history.
- **Data Enrichment**: Automatically fetches **Genres** (via Spotify) and **Audio Features** (Energy, BPM, Mood via ReccoBeats). - **Data Enrichment**:
- **Genres & Images** (via Spotify)
- **Audio Features** (Energy, BPM, Mood via ReccoBeats)
- **Lyrics & Metadata** (via Genius)
- **Dashboard**: A responsive UI (Ant Design) to view your history, stats, and "Vibes". - **Dashboard**: A responsive UI (Ant Design) to view your history, stats, and "Vibes".
- **AI Ready**: Database schema and environment prepared for Gemini AI integration. - **AI Ready**: Database schema and environment prepared for Gemini AI integration.
@@ -18,6 +21,7 @@ You can run this application using Docker Compose. You have two options: using t
- **Spotify Developer Credentials** (Client ID & Secret). - **Spotify Developer Credentials** (Client ID & Secret).
- **Spotify Refresh Token** (Run `backend/scripts/get_refresh_token.py` locally to generate this). - **Spotify Refresh Token** (Run `backend/scripts/get_refresh_token.py` locally to generate this).
- **Google Gemini API Key**. - **Google Gemini API Key**.
- **Genius API Token** (Optional, for lyrics).
### 2. Configuration (`.env`) ### 2. Configuration (`.env`)
@@ -28,6 +32,7 @@ SPOTIFY_CLIENT_ID="your_client_id"
SPOTIFY_CLIENT_SECRET="your_client_secret" SPOTIFY_CLIENT_SECRET="your_client_secret"
SPOTIFY_REFRESH_TOKEN="your_refresh_token" SPOTIFY_REFRESH_TOKEN="your_refresh_token"
GEMINI_API_KEY="your_gemini_key" GEMINI_API_KEY="your_gemini_key"
GENIUS_ACCESS_TOKEN="your_genius_token"
``` ```
### 3. Run with Docker Compose ### 3. Run with Docker Compose

View File

@@ -87,9 +87,28 @@ The LLM returns a JSON object with:
## 3. Data Models (`backend/app/models.py`) ## 3. Data Models (`backend/app/models.py`)
- **Track:** Stores static metadata and audio features. `raw_data` stores the full Spotify JSON for future-proofing. - **Track:** Stores static metadata and audio features.
- **Artist:** Normalized artist entities. Linked to tracks via `track_artists` table. - `lyrics`: Full lyrics from Genius (Text).
- `image_url`: Album art URL (String).
- `raw_data`: The full Spotify JSON for future-proofing.
- **Artist:** Normalized artist entities.
- `image_url`: Artist profile image (String).
- **PlayHistory:** The timeseries ledger. Links `Track` to a timestamp and context. - **PlayHistory:** The timeseries ledger. Links `Track` to a timestamp and context.
- **AnalysisSnapshot:** Stores the final output of these services. - **AnalysisSnapshot:** Stores the final output of these services.
- `metrics_payload`: The JSON output of `StatsService`. - `metrics_payload`: The JSON output of `StatsService`.
- `narrative_report`: The JSON output of `NarrativeService`. - `narrative_report`: The JSON output of `NarrativeService`.
## 4. External Integrations
### Spotify
- **Ingestion:** Polls `recently-played` endpoint every 60s.
- **Enrichment:** Fetches Artist genres and images.
### Genius
- **Client:** `backend/app/services/genius_client.py`.
- **Function:** Searches for lyrics and high-res album art if missing from Spotify data.
- **Trigger:** Runs during the ingestion loop for new tracks.
### ReccoBeats
- **Function:** Fetches audio features (Danceability, Energy, Valence) for tracks.

View File

@@ -0,0 +1,36 @@
"""Add image_url and lyrics columns
Revision ID: f92d8a9264d3
Revises: 4401cb416661
Create Date: 2025-12-25 22:06:05.841447
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# Revision identifiers, used by Alembic.
# `revision` is this migration's own ID; `down_revision` is the ID of the
# migration this one revises (see the module docstring above).
revision: str = 'f92d8a9264d3'
down_revision: Union[str, Sequence[str], None] = '4401cb416661'
# No branch labels or cross-revision dependencies are declared here.
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Upgrade schema: add nullable image and lyrics columns."""
    # (table, column, SQLAlchemy type) in the order the columns are added.
    additions = (
        ('artists', 'image_url', sa.String),
        ('tracks', 'image_url', sa.String),
        ('tracks', 'lyrics', sa.Text),
    )
    for table_name, column_name, column_type in additions:
        op.add_column(
            table_name,
            sa.Column(column_name, column_type(), nullable=True),
        )
def downgrade() -> None:
    """Downgrade schema: drop the image and lyrics columns."""
    # (table, column) pairs, dropped in the reverse of the order in which
    # `upgrade` adds them.
    removals = (
        ('tracks', 'lyrics'),
        ('tracks', 'image_url'),
        ('artists', 'image_url'),
    )
    for table_name, column_name in removals:
        op.drop_column(table_name, column_name)

View File

@@ -6,9 +6,10 @@ from .models import Track, PlayHistory, Artist
from .database import SessionLocal from .database import SessionLocal
from .services.spotify_client import SpotifyClient from .services.spotify_client import SpotifyClient
from .services.reccobeats_client import ReccoBeatsClient from .services.reccobeats_client import ReccoBeatsClient
from .services.genius_client import GeniusClient
from dateutil import parser from dateutil import parser
# Initialize Spotify Client (env vars will be populated later) # Initialize Clients
def get_spotify_client(): def get_spotify_client():
return SpotifyClient( return SpotifyClient(
client_id=os.getenv("SPOTIFY_CLIENT_ID"), client_id=os.getenv("SPOTIFY_CLIENT_ID"),
@@ -19,57 +20,55 @@ def get_spotify_client():
def get_reccobeats_client(): def get_reccobeats_client():
return ReccoBeatsClient() return ReccoBeatsClient()
def get_genius_client():
return GeniusClient()
async def ensure_artists_exist(db: Session, artists_data: list): async def ensure_artists_exist(db: Session, artists_data: list):
""" """
Ensures that all artists in the list exist in the Artist table. Ensures that all artists in the list exist in the Artist table.
Returns a list of Artist objects.
""" """
artist_objects = [] artist_objects = []
for a_data in artists_data: for a_data in artists_data:
artist_id = a_data["id"] artist_id = a_data["id"]
artist = db.query(Artist).filter(Artist.id == artist_id).first() artist = db.query(Artist).filter(Artist.id == artist_id).first()
if not artist: if not artist:
# Check if image is available in this payload (rare for track-linked artists, but possible)
img = None
if "images" in a_data and a_data["images"]:
img = a_data["images"][0]["url"]
artist = Artist( artist = Artist(
id=artist_id, id=artist_id,
name=a_data["name"], name=a_data["name"],
genres=[] # Will be enriched later genres=[],
image_url=img
) )
db.add(artist) db.add(artist)
# We commit inside the loop or after, but for now we rely on the main commit
# However, to return the object correctly we might need to flush if we were doing complex things,
# but here adding to session is enough for SQLAlchemy to track it.
artist_objects.append(artist) artist_objects.append(artist)
return artist_objects return artist_objects
async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client: ReccoBeatsClient): async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client: ReccoBeatsClient, genius_client: GeniusClient):
""" """
Finds tracks missing genres (Spotify) or audio features (ReccoBeats) and enriches them. Enrichment Pipeline:
Also enriches Artists with genres. 1. Audio Features (ReccoBeats)
2. Artist Metadata: Genres & Images (Spotify)
3. Lyrics & Fallback Images (Genius)
""" """
# 1. Enrich Audio Features (via ReccoBeats) # 1. Enrich Audio Features
tracks_missing_features = db.query(Track).filter(Track.danceability == None).limit(50).all() tracks_missing_features = db.query(Track).filter(Track.danceability == None).limit(50).all()
print(f"DEBUG: Found {len(tracks_missing_features)} tracks missing audio features.")
if tracks_missing_features: if tracks_missing_features:
print(f"Enriching {len(tracks_missing_features)} tracks with audio features (ReccoBeats)...") print(f"Enriching {len(tracks_missing_features)} tracks with audio features...")
ids = [t.id for t in tracks_missing_features] ids = [t.id for t in tracks_missing_features]
features_list = await recco_client.get_audio_features(ids) features_list = await recco_client.get_audio_features(ids)
# Map features by ID
features_map = {} features_map = {}
for f in features_list: for f in features_list:
# Handle potential ID mismatch or URI format
tid = f.get("id") tid = f.get("id")
if not tid and "href" in f: if tid: features_map[tid] = f
if "tracks/" in f["href"]:
tid = f["href"].split("tracks/")[1].split("?")[0]
elif "track/" in f["href"]:
tid = f["href"].split("track/")[1].split("?")[0]
if tid:
features_map[tid] = f
updated_count = 0
for track in tracks_missing_features: for track in tracks_missing_features:
data = features_map.get(track.id) data = features_map.get(track.id)
if data: if data:
@@ -84,47 +83,68 @@ async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client
track.liveness = data.get("liveness") track.liveness = data.get("liveness")
track.valence = data.get("valence") track.valence = data.get("valence")
track.tempo = data.get("tempo") track.tempo = data.get("tempo")
updated_count += 1
print(f"Updated {updated_count} tracks with audio features.")
db.commit() db.commit()
# 2. Enrich Artist Genres (via Spotify Artists) # 2. Enrich Artist Genres & Images (Spotify)
# We look for artists who have no genres. Note: an artist might genuinely have no genres, artists_missing_data = db.query(Artist).filter((Artist.genres == None) | (Artist.image_url == None)).limit(50).all()
# so we might need a flag "genres_checked" in the future, but for now checking empty list is okay. if artists_missing_data:
# However, newly created artists have genres=[] (empty list) or None? print(f"Enriching {len(artists_missing_data)} artists with genres/images...")
# My model definition: genres = Column(JSON, nullable=True) artist_ids_list = [a.id for a in artists_missing_data]
# So if it is None, we haven't fetched it.
artists_missing_genres = db.query(Artist).filter(Artist.genres == None).limit(50).all()
if artists_missing_genres:
print(f"Enriching {len(artists_missing_genres)} artists with genres (Spotify)...")
artist_ids_list = [a.id for a in artists_missing_genres]
artist_data_map = {} artist_data_map = {}
# Spotify allows fetching 50 artists at a time
for i in range(0, len(artist_ids_list), 50): for i in range(0, len(artist_ids_list), 50):
chunk = artist_ids_list[i:i+50] chunk = artist_ids_list[i:i+50]
artists_data = await spotify_client.get_artists(chunk) artists_data = await spotify_client.get_artists(chunk)
for a_data in artists_data: for a_data in artists_data:
if a_data: if a_data:
artist_data_map[a_data["id"]] = a_data.get("genres", []) img = a_data["images"][0]["url"] if a_data.get("images") else None
artist_data_map[a_data["id"]] = {
"genres": a_data.get("genres", []),
"image_url": img
}
for artist in artists_missing_genres: for artist in artists_missing_data:
genres = artist_data_map.get(artist.id) data = artist_data_map.get(artist.id)
if genres is not None: if data:
artist.genres = genres if artist.genres is None: artist.genres = data["genres"]
if artist.image_url is None: artist.image_url = data["image_url"]
elif artist.genres is None:
artist.genres = [] # Prevent retry loop
db.commit()
# 3. Enrich Lyrics (Genius)
# Only fetch for tracks that have been played recently to avoid spamming Genius API
tracks_missing_lyrics = db.query(Track).filter(Track.lyrics == None).order_by(Track.updated_at.desc()).limit(10).all()
if tracks_missing_lyrics and genius_client.genius:
print(f"Enriching {len(tracks_missing_lyrics)} tracks with lyrics (Genius)...")
for track in tracks_missing_lyrics:
# We need the primary artist name
artist_name = track.artist.split(",")[0] # Heuristic: take first artist
print(f"Searching Genius for: {track.name} by {artist_name}")
data = genius_client.search_song(track.name, artist_name)
if data:
track.lyrics = data["lyrics"]
# Fallback: if we didn't get high-res art from Spotify, use Genius
if not track.image_url and data.get("image_url"):
track.image_url = data["image_url"]
else: else:
# If we couldn't fetch, set to empty list so we don't keep retrying forever (or handle errors better) track.lyrics = "" # Mark as empty to prevent retry loop
artist.genres = []
# Small sleep to be nice to API? GeniusClient is synchronous.
# We are in async function but GeniusClient is blocking. It's fine for worker.
db.commit() db.commit()
async def ingest_recently_played(db: Session): async def ingest_recently_played(db: Session):
spotify_client = get_spotify_client() spotify_client = get_spotify_client()
recco_client = get_reccobeats_client() recco_client = get_reccobeats_client()
genius_client = get_genius_client()
try: try:
items = await spotify_client.get_recently_played(limit=50) items = await spotify_client.get_recently_played(limit=50)
@@ -144,11 +164,18 @@ async def ingest_recently_played(db: Session):
if not track: if not track:
print(f"New track found: {track_data['name']}") print(f"New track found: {track_data['name']}")
# Extract Album Art
image_url = None
if track_data.get("album") and track_data["album"].get("images"):
image_url = track_data["album"]["images"][0]["url"]
track = Track( track = Track(
id=track_id, id=track_id,
name=track_data["name"], name=track_data["name"],
artist=", ".join([a["name"] for a in track_data["artists"]]), # Legacy string artist=", ".join([a["name"] for a in track_data["artists"]]),
album=track_data["album"]["name"], album=track_data["album"]["name"],
image_url=image_url,
duration_ms=track_data["duration_ms"], duration_ms=track_data["duration_ms"],
popularity=track_data["popularity"], popularity=track_data["popularity"],
raw_data=track_data raw_data=track_data
@@ -162,11 +189,8 @@ async def ingest_recently_played(db: Session):
db.add(track) db.add(track)
db.commit() db.commit()
# Ensure relationships exist even if track existed (e.g. migration) # Ensure relationships exist logic...
# Check if track has artists linked. If not (and raw_data has them), link them.
# FIX: Logic was previously indented improperly inside `if not track`.
if not track.artists and track.raw_data and "artists" in track.raw_data: if not track.artists and track.raw_data and "artists" in track.raw_data:
print(f"Backfilling artists for track {track.name}")
artist_objects = await ensure_artists_exist(db, track.raw_data["artists"]) artist_objects = await ensure_artists_exist(db, track.raw_data["artists"])
track.artists = artist_objects track.artists = artist_objects
db.commit() db.commit()
@@ -188,7 +212,7 @@ async def ingest_recently_played(db: Session):
db.commit() db.commit()
# Enrich # Enrich
await enrich_tracks(db, spotify_client, recco_client) await enrich_tracks(db, spotify_client, recco_client, genius_client)
async def run_worker(): async def run_worker():
"""Simulates a background worker loop.""" """Simulates a background worker loop."""

View File

@@ -17,6 +17,7 @@ class Artist(Base):
id = Column(String, primary_key=True, index=True) # Spotify ID id = Column(String, primary_key=True, index=True) # Spotify ID
name = Column(String) name = Column(String)
genres = Column(JSON, nullable=True) # List of genre strings genres = Column(JSON, nullable=True) # List of genre strings
image_url = Column(String, nullable=True) # Artist profile image
# Relationships # Relationships
tracks = relationship("Track", secondary=track_artists, back_populates="artists") tracks = relationship("Track", secondary=track_artists, back_populates="artists")
@@ -28,6 +29,7 @@ class Track(Base):
name = Column(String) name = Column(String)
artist = Column(String) # Display string (e.g. "Drake, Future") - kept for convenience artist = Column(String) # Display string (e.g. "Drake, Future") - kept for convenience
album = Column(String) album = Column(String)
image_url = Column(String, nullable=True) # Album art
duration_ms = Column(Integer) duration_ms = Column(Integer)
popularity = Column(Integer, nullable=True) popularity = Column(Integer, nullable=True)
@@ -53,6 +55,7 @@ class Track(Base):
genres = Column(JSON, nullable=True) genres = Column(JSON, nullable=True)
# AI Analysis fields # AI Analysis fields
lyrics = Column(Text, nullable=True) # Full lyrics from Genius
lyrics_summary = Column(String, nullable=True) lyrics_summary = Column(String, nullable=True)
genre_tags = Column(String, nullable=True) genre_tags = Column(String, nullable=True)

View File

@@ -0,0 +1,35 @@
import os
import lyricsgenius
from typing import Optional, Dict, Any
class GeniusClient:
    """Wrapper around ``lyricsgenius`` for lyric and artwork lookups.

    The client reads ``GENIUS_ACCESS_TOKEN`` from the environment. When the
    variable is missing, ``self.genius`` is ``None`` and every search returns
    ``None`` so callers can skip lyrics enrichment.
    """

    def __init__(self):
        self.access_token = os.getenv("GENIUS_ACCESS_TOKEN")
        if self.access_token:
            self.genius = lyricsgenius.Genius(self.access_token, verbose=False, remove_section_headers=True)
        else:
            print("WARNING: GENIUS_ACCESS_TOKEN not found. Lyrics enrichment will be skipped.")
            self.genius = None

    @staticmethod
    def _clean_title(title: str) -> str:
        """Strip " - Remastered"-style suffixes and parenthesised tags from *title*.

        Falls back to the less aggressively cleaned text when stripping would
        leave nothing, e.g. for titles that start with a parenthesis such as
        "(I Can't Get No) Satisfaction".
        """
        base = title.split(" - ")[0].strip()
        cleaned = base.split("(")[0].strip()
        return cleaned or base or title.strip()

    def search_song(self, title: str, artist: str) -> Optional[Dict[str, Any]]:
        """
        Searches for a song on Genius and returns metadata + lyrics.

        Returns a dict with the keys ``lyrics``, ``image_url`` and
        ``artist_image_url``, or ``None`` when the client is disabled, no
        song is found, or the lookup raises.
        """
        if not self.genius:
            return None

        try:
            # Clean up title (remove "Feat.", "Remastered", etc for better search match)
            clean_title = self._clean_title(title)
            song = self.genius.search_song(clean_title, artist)
            if song:
                return {
                    "lyrics": song.lyrics,
                    "image_url": song.song_art_image_url,
                    "artist_image_url": song.primary_artist.image_url
                }
        except Exception as e:
            # Best-effort enrichment: report the failure and fall through to None.
            print(f"Genius Search Error for {title} by {artist}: {e}")
        return None

View File

@@ -4,6 +4,7 @@ from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional from typing import Dict, Any, List, Optional
import math import math
import numpy as np import numpy as np
from sklearn.cluster import KMeans
from ..models import PlayHistory, Track, Artist from ..models import PlayHistory, Track, Artist
@@ -78,10 +79,18 @@ class StatsService:
genre_counts = {} genre_counts = {}
album_counts = {} album_counts = {}
# Maps for resolving names later without DB hits # Maps for resolving names/images later without DB hits
track_map = {} track_map = {}
artist_map = {} artist_map = {}
album_map = {} album_map = {}
# Helper to safely get image
def get_track_image(t):
if t.image_url: return t.image_url
if t.raw_data and "album" in t.raw_data and "images" in t.raw_data["album"]:
imgs = t.raw_data["album"]["images"]
if imgs: return imgs[0].get("url")
return None
for p in plays: for p in plays:
t = p.track t = p.track
@@ -102,12 +111,15 @@ class StatsService:
album_name = t.raw_data["album"].get("name", t.album) album_name = t.raw_data["album"].get("name", t.album)
album_counts[album_id] = album_counts.get(album_id, 0) + 1 album_counts[album_id] = album_counts.get(album_id, 0) + 1
album_map[album_id] = album_name # Store tuple of (name, image_url)
if album_id not in album_map:
album_map[album_id] = {"name": album_name, "image": get_track_image(t)}
# Artist Aggregation (Iterate objects, not string) # Artist Aggregation (Iterate objects, not string)
for artist in t.artists: for artist in t.artists:
artist_counts[artist.id] = artist_counts.get(artist.id, 0) + 1 artist_counts[artist.id] = artist_counts.get(artist.id, 0) + 1
artist_map[artist.id] = artist.name if artist.id not in artist_map:
artist_map[artist.id] = {"name": artist.name, "image": artist.image_url}
# Genre Aggregation # Genre Aggregation
if artist.genres: if artist.genres:
@@ -124,19 +136,20 @@ class StatsService:
top_tracks = [ top_tracks = [
{ {
"name": track_map[tid].name, "name": track_map[tid].name,
"artist": ", ".join([a.name for a in track_map[tid].artists]), # Correct artist display "artist": ", ".join([a.name for a in track_map[tid].artists]),
"image": get_track_image(track_map[tid]),
"count": c "count": c
} }
for tid, c in sorted(track_counts.items(), key=lambda x: x[1], reverse=True)[:5] for tid, c in sorted(track_counts.items(), key=lambda x: x[1], reverse=True)[:5]
] ]
top_artists = [ top_artists = [
{"name": artist_map.get(aid, "Unknown"), "count": c} {"name": artist_map[aid]["name"], "id": aid, "image": artist_map[aid]["image"], "count": c}
for aid, c in sorted(artist_counts.items(), key=lambda x: x[1], reverse=True)[:5] for aid, c in sorted(artist_counts.items(), key=lambda x: x[1], reverse=True)[:5]
] ]
top_albums = [ top_albums = [
{"name": album_map.get(aid, "Unknown"), "count": c} {"name": album_map[aid]["name"], "image": album_map[aid]["image"], "count": c}
for aid, c in sorted(album_counts.items(), key=lambda x: x[1], reverse=True)[:5] for aid, c in sorted(album_counts.items(), key=lambda x: x[1], reverse=True)[:5]
] ]
@@ -188,7 +201,7 @@ class StatsService:
def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Includes Part-of-Day buckets, Listening Streaks, and Active Days stats. Includes Part-of-Day buckets, Listening Streaks, Active Days, and 2D Heatmap.
""" """
query = self.db.query(PlayHistory).filter( query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
@@ -199,16 +212,24 @@ class StatsService:
if not plays: if not plays:
return {} return {}
# Heatmap: 7 days x 24 hours
heatmap = [[0 for _ in range(24)] for _ in range(7)]
hourly_counts = [0] * 24 hourly_counts = [0] * 24
weekday_counts = [0] * 7 weekday_counts = [0] * 7
# Spec: Morning (6-12), Afternoon (12-18), Evening (18-24), Night (0-6)
part_of_day = {"morning": 0, "afternoon": 0, "evening": 0, "night": 0} part_of_day = {"morning": 0, "afternoon": 0, "evening": 0, "night": 0}
active_dates = set() active_dates = set()
for p in plays: for p in plays:
h = p.played_at.hour h = p.played_at.hour
d = p.played_at.weekday()
# Populate Heatmap
heatmap[d][h] += 1
hourly_counts[h] += 1 hourly_counts[h] += 1
weekday_counts[p.played_at.weekday()] += 1 weekday_counts[d] += 1
active_dates.add(p.played_at.date()) active_dates.add(p.played_at.date())
if 6 <= h < 12: if 6 <= h < 12:
@@ -240,6 +261,7 @@ class StatsService:
active_days_count = len(active_dates) active_days_count = len(active_dates)
return { return {
"heatmap": heatmap, # 7x24 Matrix
"hourly_distribution": hourly_counts, "hourly_distribution": hourly_counts,
"peak_hour": hourly_counts.index(max(hourly_counts)), "peak_hour": hourly_counts.index(max(hourly_counts)),
"weekday_distribution": weekday_counts, "weekday_distribution": weekday_counts,
@@ -253,7 +275,7 @@ class StatsService:
def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Includes Micro-sessions, Marathon sessions, Energy Arcs, and Median metrics. Includes Micro-sessions, Marathon sessions, Energy Arcs, Median metrics, and Session List.
""" """
query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
@@ -282,21 +304,41 @@ class StatsService:
marathon_sessions = 0 marathon_sessions = 0
energy_arcs = {"rising": 0, "falling": 0, "flat": 0, "unknown": 0} energy_arcs = {"rising": 0, "falling": 0, "flat": 0, "unknown": 0}
start_hour_dist = [0] * 24 start_hour_dist = [0] * 24
session_list = [] # Metadata for timeline
for sess in sessions: for sess in sessions:
start_t = sess[0].played_at
end_t = sess[-1].played_at
# Start time distribution # Start time distribution
start_hour_dist[sess[0].played_at.hour] += 1 start_hour_dist[start_t.hour] += 1
# Durations # Durations
if len(sess) > 1: if len(sess) > 1:
duration = (sess[-1].played_at - sess[0].played_at).total_seconds() / 60 duration = (end_t - start_t).total_seconds() / 60
lengths_min.append(duration) lengths_min.append(duration)
else: else:
lengths_min.append(3.0) # Approx single song duration = 3.0 # Approx single song
lengths_min.append(duration)
# Types # Types
if len(sess) <= 3: micro_sessions += 1 sess_type = "Standard"
if len(sess) >= 20: marathon_sessions += 1 if len(sess) <= 3:
micro_sessions += 1
sess_type = "Micro"
elif len(sess) >= 20:
marathon_sessions += 1
sess_type = "Marathon"
# Store Session Metadata
session_list.append({
"start_time": start_t.isoformat(),
"end_time": end_t.isoformat(),
"duration_minutes": round(duration, 1),
"track_count": len(sess),
"type": sess_type
})
# Energy Arc # Energy Arc
first_t = sess[0].track first_t = sess[0].track
@@ -326,12 +368,13 @@ class StatsService:
"start_hour_distribution": start_hour_dist, "start_hour_distribution": start_hour_dist,
"micro_session_rate": round(micro_sessions / len(sessions), 2), "micro_session_rate": round(micro_sessions / len(sessions), 2),
"marathon_session_rate": round(marathon_sessions / len(sessions), 2), "marathon_session_rate": round(marathon_sessions / len(sessions), 2),
"energy_arcs": energy_arcs "energy_arcs": energy_arcs,
"session_list": session_list
} }
def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Aggregates Audio Features + Calculates Whiplash, Percentiles, and Profiles. Aggregates Audio Features + Calculates Whiplash + Clustering + Harmonic Profile.
""" """
plays = self.db.query(PlayHistory).filter( plays = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
@@ -349,6 +392,14 @@ class StatsService:
feature_keys = ["energy", "valence", "danceability", "tempo", "acousticness", feature_keys = ["energy", "valence", "danceability", "tempo", "acousticness",
"instrumentalness", "liveness", "speechiness", "loudness"] "instrumentalness", "liveness", "speechiness", "loudness"]
features = {k: [] for k in feature_keys} features = {k: [] for k in feature_keys}
# For Clustering: List of [energy, valence, danceability, acousticness]
cluster_data = []
# For Harmonic & Tempo
keys = []
modes = []
tempo_zones = {"chill": 0, "groove": 0, "hype": 0}
# 2. Transition Arrays (for Whiplash) # 2. Transition Arrays (for Whiplash)
transitions = {"tempo": [], "energy": [], "valence": []} transitions = {"tempo": [], "energy": [], "valence": []}
@@ -364,6 +415,20 @@ class StatsService:
val = getattr(t, key, None) val = getattr(t, key, None)
if val is not None: if val is not None:
features[key].append(val) features[key].append(val)
# Cluster Data (only if all 4 exist)
if all(getattr(t, k) is not None for k in ["energy", "valence", "danceability", "acousticness"]):
cluster_data.append([t.energy, t.valence, t.danceability, t.acousticness])
# Harmonic
if t.key is not None: keys.append(t.key)
if t.mode is not None: modes.append(t.mode)
# Tempo Zones
if t.tempo is not None:
if t.tempo < 100: tempo_zones["chill"] += 1
elif t.tempo < 130: tempo_zones["groove"] += 1
else: tempo_zones["hype"] += 1
# Calculate Transitions (Whiplash) # Calculate Transitions (Whiplash)
if i > 0 and previous_track: if i > 0 and previous_track:
@@ -381,12 +446,13 @@ class StatsService:
# Calculate Stats (Mean, Std, Percentiles) # Calculate Stats (Mean, Std, Percentiles)
stats = {} stats = {}
for key, values in features.items(): for key, values in features.items():
if values: valid = [v for v in values if v is not None]
stats[f"avg_{key}"] = float(np.mean(values)) if valid:
stats[f"std_{key}"] = float(np.std(values)) stats[f"avg_{key}"] = float(np.mean(valid))
stats[f"p10_{key}"] = float(np.percentile(values, 10)) stats[f"std_{key}"] = float(np.std(valid))
stats[f"p50_{key}"] = float(np.percentile(values, 50)) # Median stats[f"p10_{key}"] = float(np.percentile(valid, 10))
stats[f"p90_{key}"] = float(np.percentile(values, 90)) stats[f"p50_{key}"] = float(np.percentile(valid, 50)) # Median
stats[f"p90_{key}"] = float(np.percentile(valid, 90))
else: else:
stats[f"avg_{key}"] = None stats[f"avg_{key}"] = None
@@ -396,31 +462,97 @@ class StatsService:
"x": round(stats["avg_valence"], 2), "x": round(stats["avg_valence"], 2),
"y": round(stats["avg_energy"], 2) "y": round(stats["avg_energy"], 2)
} }
# Consistency
avg_std = (stats.get("std_energy", 0) + stats.get("std_valence", 0)) / 2 avg_std = (stats.get("std_energy", 0) + stats.get("std_valence", 0)) / 2
stats["consistency_score"] = round(1.0 - avg_std, 2) stats["consistency_score"] = round(1.0 - avg_std, 2)
# Rhythm Profile
if stats.get("avg_tempo") is not None and stats.get("avg_danceability") is not None: if stats.get("avg_tempo") is not None and stats.get("avg_danceability") is not None:
stats["rhythm_profile"] = { stats["rhythm_profile"] = {
"avg_tempo": round(stats["avg_tempo"], 1), "avg_tempo": round(stats["avg_tempo"], 1),
"avg_danceability": round(stats["avg_danceability"], 2) "avg_danceability": round(stats["avg_danceability"], 2)
} }
# Texture Profile
if stats.get("avg_acousticness") is not None and stats.get("avg_instrumentalness") is not None: if stats.get("avg_acousticness") is not None and stats.get("avg_instrumentalness") is not None:
stats["texture_profile"] = { stats["texture_profile"] = {
"acousticness": round(stats["avg_acousticness"], 2), "acousticness": round(stats["avg_acousticness"], 2),
"instrumentalness": round(stats["avg_instrumentalness"], 2) "instrumentalness": round(stats["avg_instrumentalness"], 2)
} }
# Whiplash Scores # Whiplash
stats["whiplash"] = {} stats["whiplash"] = {}
for k in ["tempo", "energy", "valence"]: for k in ["tempo", "energy", "valence"]:
if transitions[k]: if transitions[k]:
stats["whiplash"][k] = round(float(np.mean(transitions[k])), 2) stats["whiplash"][k] = round(float(np.mean(transitions[k])), 2)
else: else:
stats["whiplash"][k] = 0 stats["whiplash"][k] = 0
# Tempo Zones
total_tempo = sum(tempo_zones.values())
if total_tempo > 0:
stats["tempo_zones"] = {k: round(v / total_tempo, 2) for k, v in tempo_zones.items()}
else:
stats["tempo_zones"] = {}
# Harmonic Profile
if modes:
major_count = len([m for m in modes if m == 1])
stats["harmonic_profile"] = {
"major_pct": round(major_count / len(modes), 2),
"minor_pct": round((len(modes) - major_count) / len(modes), 2)
}
if keys:
# Map integers to pitch class notation
pitch_class = ["C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B"]
key_counts = {}
for k in keys:
if 0 <= k < 12:
label = pitch_class[k]
key_counts[label] = key_counts.get(label, 0) + 1
stats["top_keys"] = [{"key": k, "count": v} for k, v in sorted(key_counts.items(), key=lambda x: x[1], reverse=True)[:3]]
# CLUSTERING (K-Means)
if len(cluster_data) >= 5: # Need enough data points
try:
# Features: energy, valence, danceability, acousticness
kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
labels = kmeans.fit_predict(cluster_data)
# Analyze clusters
clusters = []
for i in range(3):
mask = (labels == i)
count = np.sum(mask)
if count == 0: continue
centroid = kmeans.cluster_centers_[i]
share = count / len(cluster_data)
# Heuristic Naming
c_energy, c_valence, c_dance, c_acoustic = centroid
name = "Mixed Vibe"
if c_energy > 0.7: name = "High Energy"
elif c_acoustic > 0.7: name = "Acoustic / Chill"
elif c_valence < 0.3: name = "Melancholy"
elif c_dance > 0.7: name = "Dance / Groove"
clusters.append({
"name": name,
"share": round(share, 2),
"features": {
"energy": round(c_energy, 2),
"valence": round(c_valence, 2),
"danceability": round(c_dance, 2),
"acousticness": round(c_acoustic, 2)
}
})
# Sort by share
stats["clusters"] = sorted(clusters, key=lambda x: x["share"], reverse=True)
except Exception as e:
print(f"Clustering failed: {e}")
stats["clusters"] = []
else:
stats["clusters"] = []
return stats return stats
@@ -448,9 +580,11 @@ class StatsService:
if not years: if not years:
return {"musical_age": None} return {"musical_age": None}
# Musical Age (Weighted Average)
avg_year = sum(years) / len(years) avg_year = sum(years) / len(years)
current_year = datetime.utcnow().year current_year = datetime.utcnow().year
# Decade Distribution
decades = {} decades = {}
for y in years: for y in years:
dec = (y // 10) * 10 dec = (y // 10) * 10
@@ -463,17 +597,18 @@ class StatsService:
return { return {
"musical_age": int(avg_year), "musical_age": int(avg_year),
"nostalgia_gap": int(current_year - avg_year), "nostalgia_gap": int(current_year - avg_year),
"freshness_score": dist.get(f"{int(current_year / 10) * 10}s", 0), "freshness_score": dist.get(f"{int(current_year / 10) * 10}s", 0), # Share of current decade
"decade_distribution": dist "decade_distribution": dist
} }
def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Implements boredom skip detection. Implements boredom skip detection:
(next_track.played_at - current_track.played_at) < (current_track.duration_ms / 1000 - 10s)
""" """
query = self.db.query(PlayHistory).filter( query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at < period_end PlayHistory.played_at <= period_end
).order_by(PlayHistory.played_at.asc()) ).order_by(PlayHistory.played_at.asc())
plays = query.all() plays = query.all()
@@ -485,10 +620,7 @@ class StatsService:
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all() tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
track_map = {t.id: t for t in tracks} track_map = {t.id: t for t in tracks}
# Denominator: transitions, which is plays - 1 for i in range(len(plays) - 1):
transitions_count = len(plays) - 1
for i in range(transitions_count):
current_play = plays[i] current_play = plays[i]
next_play = plays[i+1] next_play = plays[i+1]
track = track_map.get(current_play.track_id) track = track_map.get(current_play.track_id)
@@ -497,28 +629,31 @@ class StatsService:
continue continue
diff_seconds = (next_play.played_at - current_play.played_at).total_seconds() diff_seconds = (next_play.played_at - current_play.played_at).total_seconds()
duration_sec = track.duration_ms / 1000.0
# Logic: If diff < (duration - 10s), it's a skip. # Logic: If diff < (duration - 10s), it's a skip.
# AND it must be a "valid" listening attempt (e.g. > 30s) # Convert duration to seconds
# AND it shouldn't be a huge gap (e.g. paused for 2 hours then hit next) duration_sec = track.duration_ms / 1000.0
if 30 < diff_seconds < (duration_sec - 10): # Also ensure diff isn't negative or weirdly small (re-plays)
# And assume "listening" means diff > 30s at least?
# Spec says "Spotify only returns 30s+".
if diff_seconds < (duration_sec - 10):
skips += 1 skips += 1
return { return {
"total_skips": skips, "total_skips": skips,
"skip_rate": round(skips / transitions_count, 3) if transitions_count > 0 else 0 "skip_rate": round(skips / len(plays), 3)
} }
def compute_context_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_context_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Analyzes context_uri and switching rate. Analyzes context_uri to determine if user listens to Playlists, Albums, or Artists.
""" """
query = self.db.query(PlayHistory).filter( query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at < period_end PlayHistory.played_at <= period_end
).order_by(PlayHistory.played_at.asc()) )
plays = query.all() plays = query.all()
if not plays: if not plays:
@@ -526,32 +661,31 @@ class StatsService:
context_counts = {"playlist": 0, "album": 0, "artist": 0, "collection": 0, "unknown": 0} context_counts = {"playlist": 0, "album": 0, "artist": 0, "collection": 0, "unknown": 0}
unique_contexts = {} unique_contexts = {}
context_switches = 0
last_context = None
for p in plays: for p in plays:
uri = p.context_uri if not p.context_uri:
if not uri:
context_counts["unknown"] += 1 context_counts["unknown"] += 1
uri = "unknown" continue
else:
if "playlist" in uri: context_counts["playlist"] += 1
elif "album" in uri: context_counts["album"] += 1
elif "artist" in uri: context_counts["artist"] += 1
elif "collection" in uri: context_counts["collection"] += 1
else: context_counts["unknown"] += 1
if uri != "unknown": # Count distinct contexts for loyalty
unique_contexts[uri] = unique_contexts.get(uri, 0) + 1 unique_contexts[p.context_uri] = unique_contexts.get(p.context_uri, 0) + 1
# Switch detection if "playlist" in p.context_uri:
if last_context and uri != last_context: context_counts["playlist"] += 1
context_switches += 1 elif "album" in p.context_uri:
last_context = uri context_counts["album"] += 1
elif "artist" in p.context_uri:
context_counts["artist"] += 1
elif "collection" in p.context_uri:
# "Liked Songs" usually shows up as collection
context_counts["collection"] += 1
else:
context_counts["unknown"] += 1
total = len(plays) total = len(plays)
breakdown = {k: round(v / total, 2) for k, v in context_counts.items()} breakdown = {k: round(v / total, 2) for k, v in context_counts.items()}
# Top 5 Contexts (Requires resolving URI to name, possibly missing metadata here)
sorted_contexts = sorted(unique_contexts.items(), key=lambda x: x[1], reverse=True)[:5] sorted_contexts = sorted(unique_contexts.items(), key=lambda x: x[1], reverse=True)[:5]
return { return {
@@ -559,17 +693,16 @@ class StatsService:
"album_purist_score": breakdown.get("album", 0), "album_purist_score": breakdown.get("album", 0),
"playlist_dependency": breakdown.get("playlist", 0), "playlist_dependency": breakdown.get("playlist", 0),
"context_loyalty": round(len(plays) / len(unique_contexts), 2) if unique_contexts else 0, "context_loyalty": round(len(plays) / len(unique_contexts), 2) if unique_contexts else 0,
"context_switching_rate": round(context_switches / (total - 1), 2) if total > 1 else 0,
"top_context_uris": [{"uri": k, "count": v} for k, v in sorted_contexts] "top_context_uris": [{"uri": k, "count": v} for k, v in sorted_contexts]
} }
def compute_taste_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_taste_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Mainstream vs. Hipster analysis. Mainstream vs. Hipster analysis based on Track.popularity (0-100).
""" """
query = self.db.query(PlayHistory).filter( query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at < period_end PlayHistory.played_at <= period_end
) )
plays = query.all() plays = query.all()
if not plays: return {} if not plays: return {}
@@ -602,47 +735,38 @@ class StatsService:
def compute_lifecycle_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
    """
    Determines if tracks are 'New Discoveries' or 'Old Favorites'.

    A track is a new discovery when it has a play inside
    [period_start, period_end] and no PlayHistory row dated before
    period_start.

    Args:
        period_start: inclusive lower bound on PlayHistory.played_at.
        period_end: inclusive upper bound on PlayHistory.played_at.

    Returns:
        An empty dict when nothing was played in the period, otherwise:
          - discovery_count: number of distinct newly discovered tracks
          - discovery_rate: share of the period's plays spent on new discoveries
          - recurrence_rate: share of the period's plays spent on tracks
            that were already in the history
    """
    # 1. Get tracks played in this period
    current_plays = self.db.query(PlayHistory).filter(
        PlayHistory.played_at >= period_start,
        PlayHistory.played_at <= period_end
    ).all()

    if not current_plays:
        return {}

    current_track_ids = {p.track_id for p in current_plays}

    # 2. Check if these tracks were played BEFORE period_start:
    #    find which of the current track ids already exist in older history.
    old_tracks_query = self.db.query(distinct(PlayHistory.track_id)).filter(
        PlayHistory.track_id.in_(current_track_ids),
        PlayHistory.played_at < period_start
    )
    old_track_ids = {row[0] for row in old_tracks_query.all()}

    # 3. Calculate Discovery
    new_discoveries = current_track_ids - old_track_ids

    # Plays (not distinct tracks) that landed on a new discovery
    plays_on_new = sum(1 for p in current_plays if p.track_id in new_discoveries)
    # Non-zero here: the empty case returned early above, so no division guard is needed
    total_plays = len(current_plays)

    return {
        "discovery_count": len(new_discoveries),
        "discovery_rate": round(plays_on_new / total_plays, 3),
        "recurrence_rate": round((total_plays - plays_on_new) / total_plays, 3)
    }
def compute_explicit_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_explicit_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
@@ -651,7 +775,7 @@ class StatsService:
""" """
query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter( query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at < period_end PlayHistory.played_at <= period_end
) )
plays = query.all() plays = query.all()
@@ -665,14 +789,24 @@ class StatsService:
for p in plays: for p in plays:
h = p.played_at.hour h = p.played_at.hour
hourly_total[h] += 1 hourly_total[h] += 1
# Check raw_data for explicit flag
t = p.track t = p.track
is_explicit = False
if t.raw_data and t.raw_data.get("explicit"): if t.raw_data and t.raw_data.get("explicit"):
is_explicit = True
if is_explicit:
explicit_count += 1 explicit_count += 1
hourly_explicit[h] += 1 hourly_explicit[h] += 1
# Calculate hourly percentages
hourly_rates = [] hourly_rates = []
for i in range(24): for i in range(24):
hourly_rates.append(round(hourly_explicit[i] / hourly_total[i], 2) if hourly_total[i] > 0 else 0.0) if hourly_total[i] > 0:
hourly_rates.append(round(hourly_explicit[i] / hourly_total[i], 2))
else:
hourly_rates.append(0.0)
return { return {
"explicit_rate": round(explicit_count / total_plays, 3), "explicit_rate": round(explicit_count / total_plays, 3),
@@ -681,6 +815,7 @@ class StatsService:
} }
def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
# 1. Calculate all current stats
current_stats = { current_stats = {
"period": {"start": period_start.isoformat(), "end": period_end.isoformat()}, "period": {"start": period_start.isoformat(), "end": period_end.isoformat()},
"volume": self.compute_volume_stats(period_start, period_end), "volume": self.compute_volume_stats(period_start, period_end),
@@ -695,7 +830,9 @@ class StatsService:
"skips": self.compute_skip_stats(period_start, period_end) "skips": self.compute_skip_stats(period_start, period_end)
} }
# 2. Calculate Comparison
current_stats["comparison"] = self.compute_comparison(current_stats, period_start, period_end) current_stats["comparison"] = self.compute_comparison(current_stats, period_start, period_end)
return current_stats return current_stats
def _empty_volume_stats(self): def _empty_volume_stats(self):
@@ -710,4 +847,4 @@ class StatsService:
def _pct_change(self, curr, prev): def _pct_change(self, curr, prev):
if prev == 0: if prev == 0:
return 100.0 if curr > 0 else 0.0 return 100.0 if curr > 0 else 0.0
return round(((curr - prev) / prev) * 100, 1) return round(((curr - prev) / prev) * 100, 1)

View File

@@ -11,3 +11,4 @@ python-dateutil==2.9.0.post0
requests==2.31.0 requests==2.31.0
alembic==1.13.1 alembic==1.13.1
scikit-learn==1.4.0 scikit-learn==1.4.0
lyricsgenius==3.0.1