Fixed and added all the stats_service.py methods

This commit is contained in:
bnair123
2025-12-25 17:48:41 +04:00
parent d63a05fb72
commit 508d001d7e
4 changed files with 580 additions and 245 deletions

View File

@@ -1,12 +1,15 @@
from fastapi import FastAPI, Depends from fastapi import FastAPI, Depends, HTTPException, BackgroundTasks
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from datetime import datetime, timedelta
from typing import List, Optional
from dotenv import load_dotenv
from .database import engine, Base, get_db from .database import engine, Base, get_db
from .models import PlayHistory as PlayHistoryModel, Track as TrackModel from .models import PlayHistory as PlayHistoryModel, Track as TrackModel, AnalysisSnapshot
from . import schemas from . import schemas
from .ingest import ingest_recently_played from .ingest import ingest_recently_played
import asyncio from .services.stats_service import StatsService
from typing import List from .services.narrative_service import NarrativeService
from dotenv import load_dotenv
load_dotenv() load_dotenv()
@@ -24,13 +27,68 @@ def get_history(limit: int = 50, db: Session = Depends(get_db)):
history = db.query(PlayHistoryModel).order_by(PlayHistoryModel.played_at.desc()).limit(limit).all() history = db.query(PlayHistoryModel).order_by(PlayHistoryModel.played_at.desc()).limit(limit).all()
return history return history
@app.post("/trigger-ingest")
async def trigger_ingest(db: Session = Depends(get_db)):
"""Manually trigger the ingestion process (useful for testing)"""
await ingest_recently_played(db)
return {"status": "Ingestion triggered"}
@app.get("/tracks", response_model=List[schemas.Track]) @app.get("/tracks", response_model=List[schemas.Track])
def get_tracks(limit: int = 50, db: Session = Depends(get_db)): def get_tracks(limit: int = 50, db: Session = Depends(get_db)):
tracks = db.query(TrackModel).limit(limit).all() tracks = db.query(TrackModel).limit(limit).all()
return tracks return tracks
@app.post("/trigger-ingest")
async def trigger_ingest(background_tasks: BackgroundTasks, db: Session = Depends(get_db)):
"""Triggers Spotify ingestion in the background."""
background_tasks.add_task(ingest_recently_played, db)
return {"status": "Ingestion started in background"}
@app.post("/trigger-analysis")
def trigger_analysis(
days: int = 30,
model_name: str = "gemini-2.5-flash",
db: Session = Depends(get_db)
):
"""
Runs the full analysis pipeline (Stats + LLM) for the last X days.
Returns the computed metrics and narrative immediately.
"""
try:
end_date = datetime.utcnow()
start_date = end_date - timedelta(days=days)
# 1. Compute Stats
stats_service = StatsService(db)
stats_json = stats_service.generate_full_report(start_date, end_date)
if stats_json["volume"]["total_plays"] == 0:
raise HTTPException(status_code=404, detail="No plays found in the specified period.")
# 2. Generate Narrative
narrative_service = NarrativeService(model_name=model_name)
narrative_json = narrative_service.generate_narrative(stats_json)
# 3. Save Snapshot
snapshot = AnalysisSnapshot(
period_start=start_date,
period_end=end_date,
period_label=f"last_{days}_days",
metrics_payload=stats_json,
narrative_report=narrative_json,
model_used=model_name
)
db.add(snapshot)
db.commit()
db.refresh(snapshot)
return {
"status": "success",
"snapshot_id": snapshot.id,
"period": {"start": start_date, "end": end_date},
"metrics": stats_json,
"narrative": narrative_json
}
except Exception as e:
print(f"Analysis Failed: {e}")
raise HTTPException(status_code=500, detail=str(e))
@app.get("/snapshots")
def get_snapshots(limit: int = 10, db: Session = Depends(get_db)):
"""Retrieve past analysis snapshots."""
return db.query(AnalysisSnapshot).order_by(AnalysisSnapshot.date.desc()).limit(limit).all()

View File

@@ -18,43 +18,35 @@ class NarrativeService:
return {"error": "Missing API Key"} return {"error": "Missing API Key"}
prompt = f""" prompt = f"""
You are analyzing a user's Spotify listening data. Below is a JSON summary of metrics I've computed. Your job is to: You are a witty, insightful, and slightly snarky music critic analyzing a user's listening history.
Below is a JSON summary of their listening data.
1. Write a narrative "Vibe Check" (2-3 paragraphs) describing their overall listening personality this period. Your goal is to generate a report that feels like a 'Spotify Wrapped' but deeper and more honest.
2. Identify 3-5 notable patterns or anomalies.
3. Provide a "Musical Persona" label (e.g., "Late-Night Binge Listener", "Genre Chameleon", "Album Purist").
4. Write a brief, playful "roast" (1-2 sentences) based on the data.
Guidelines: Please output your response in strict JSON format with the following keys:
- Do NOT recalculate any numbers. 1. "vibe_check": (String) 2-3 paragraphs describing their overall listening personality.
- Use specific metrics to support observations (e.g., "Your whiplash score of 18.3 BPM suggests..."). 2. "patterns": (List of Strings) 3-5 specific observations based on the data (e.g., "You listen to sad music on Tuesdays", "Your Whiplash Score is high").
- Keep tone conversational but insightful. 3. "persona": (String) A creative label for the user (e.g., "The Genre Chameleon", "Nostalgic Dad-Rocker", "Algorithm Victim").
- Avoid mental health claims; stick to behavioral descriptors. 4. "roast": (String) A playful, harmlessly mean roast about their taste (1-2 sentences).
- Highlight both positive patterns and quirks. 5. "era_insight": (String) A specific comment on their 'Musical Age' and 'Nostalgia Gap'.
Data: GUIDELINES:
- **Use the Metrics:** Do not just say "You like pop." Say "Your Mainstream Score of 85% suggests you live on the Top 40."
- **Whiplash Score:** If 'whiplash' > 20, comment on their chaotic transitions.
- **Hipster Score:** If 'hipster_score' > 50, call them pretentious; if < 10, call them basic.
- **Comparison:** Use the 'comparison' block to mention if they are listening more/less or if their mood (valence/energy) has shifted.
- **Tone:** Conversational, fun, slightly judgmental but good-natured.
DATA:
{json.dumps(stats_json, indent=2)} {json.dumps(stats_json, indent=2)}
Output Format (return valid JSON): OUTPUT (JSON):
{{
"vibe_check": "...",
"patterns": ["...", "..."],
"persona": "...",
"roast": "..."
}}
""" """
try: try:
# Handle full model path if passed or default short name model = genai.GenerativeModel(self.model_name)
# The library often accepts 'gemini-2.5-flash' but list_models returns 'models/gemini-2.5-flash'
model_id = self.model_name
if not model_id.startswith("models/") and "/" not in model_id:
# Try simple name, if it fails user might need to pass 'models/...'
pass
model = genai.GenerativeModel(model_id)
response = model.generate_content(prompt) response = model.generate_content(prompt)
# Clean up response to ensure valid JSON (sometimes LLMs add markdown blocks) # Clean up response to ensure valid JSON
text = response.text.strip() text = response.text.strip()
if text.startswith("```json"): if text.startswith("```json"):
text = text.replace("```json", "").replace("```", "") text = text.replace("```json", "").replace("```", "")
@@ -64,4 +56,4 @@ Output Format (return valid JSON):
return json.loads(text) return json.loads(text)
except Exception as e: except Exception as e:
return {"error": str(e), "raw_response": response.text if 'response' in locals() else "No response"} return {"error": str(e), "raw_response": "Error generating narrative."}

View File

@@ -1,5 +1,5 @@
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from sqlalchemy import func, distinct, desc from sqlalchemy import func, distinct, desc, joinedload
from datetime import datetime, timedelta from datetime import datetime, timedelta
from typing import Dict, Any, List from typing import Dict, Any, List
import math import math
@@ -11,11 +11,68 @@ class StatsService:
def __init__(self, db: Session): def __init__(self, db: Session):
self.db = db self.db = db
from sqlalchemy.orm import joinedload # Add this to imports
def compute_comparison(self, current_stats: Dict[str, Any], period_start: datetime, period_end: datetime) -> Dict[
str, Any]:
"""
Calculates deltas vs the previous period of the same length.
"""
duration = period_end - period_start
prev_end = period_start
prev_start = prev_end - duration
# We only need key metrics for comparison, not the full heavy report
# Let's re-use existing methods but strictly for the previous window
# 1. Volume Comparison
prev_volume = self.compute_volume_stats(prev_start, prev_end)
# 2. Vibe Comparison (Just energy/valence/popularity)
prev_vibe = self.compute_vibe_stats(prev_start, prev_end)
prev_taste = self.compute_taste_stats(prev_start, prev_end)
# Calculate Deltas
deltas = {}
# Plays
curr_plays = current_stats["volume"]["total_plays"]
prev_plays_count = prev_volume["total_plays"]
deltas["plays_delta"] = curr_plays - prev_plays_count
deltas["plays_pct_change"] = round(((curr_plays - prev_plays_count) / prev_plays_count) * 100,
1) if prev_plays_count else 0
# Energy & Valence
if "mood_quadrant" in current_stats["vibe"] and "mood_quadrant" in prev_vibe:
curr_e = current_stats["vibe"]["mood_quadrant"]["y"]
prev_e = prev_vibe["mood_quadrant"]["y"]
deltas["energy_delta"] = round(curr_e - prev_e, 2)
curr_v = current_stats["vibe"]["mood_quadrant"]["x"]
prev_v = prev_vibe["mood_quadrant"]["x"]
deltas["valence_delta"] = round(curr_v - prev_v, 2)
# Popularity
if "avg_popularity" in current_stats["taste"] and "avg_popularity" in prev_taste:
deltas["popularity_delta"] = round(current_stats["taste"]["avg_popularity"] - prev_taste["avg_popularity"],
1)
return {
"previous_period": {
"start": prev_start.isoformat(),
"end": prev_end.isoformat()
},
"deltas": deltas
}
def compute_volume_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_volume_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Calculates volume metrics: Total Plays, Unique Tracks, Artists, etc. Calculates volume metrics including Concentration (HHI, Gini) and One-and-Done rates.
""" """
query = self.db.query(PlayHistory).filter( # Eager load tracks AND artists to fix the "Artist String Problem" and performance
query = self.db.query(PlayHistory).options(
joinedload(PlayHistory.track).joinedload(Track.artists)
).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end PlayHistory.played_at <= period_end
) )
@@ -24,167 +81,94 @@ class StatsService:
if total_plays == 0: if total_plays == 0:
return { return {
"total_plays": 0, "total_plays": 0, "estimated_minutes": 0, "unique_tracks": 0,
"estimated_minutes": 0, "unique_artists": 0, "unique_albums": 0, "unique_genres": 0,
"unique_tracks": 0, "top_tracks": [], "top_artists": [], "top_genres": [],
"unique_artists": 0, "repeat_rate": 0, "concentration": {}
"unique_albums": 0,
"unique_genres": 0,
"top_tracks": [],
"top_artists": [],
"repeat_rate": 0,
"concentration": {}
} }
# Calculate Duration (Estimated)
# Note: We query tracks to get duration.
# Ideally we join, but eager loading might be heavy. Let's do a join or simple loop.
# Efficient approach: Get all track IDs from plays, fetch Track objects in bulk map.
track_ids = [p.track_id for p in plays]
tracks = self.db.query(Track).filter(Track.id.in_(set(track_ids))).all()
track_map = {t.id: t for t in tracks}
total_ms = 0 total_ms = 0
unique_track_ids = set() track_counts = {}
unique_artist_ids = set() artist_counts = {}
unique_album_names = set() # Spotify doesn't give album ID in PlayHistory directly unless joined, track has album name string.
# Ideally track has raw_data['album']['id'].
unique_album_ids = set()
genre_counts = {} genre_counts = {}
album_ids = set()
# For Top Lists
track_play_counts = {}
artist_play_counts = {}
for p in plays: for p in plays:
t = track_map.get(p.track_id) t = p.track
if t: if not t: continue
total_ms += t.duration_ms
unique_track_ids.add(t.id)
# Top Tracks total_ms += t.duration_ms if t.duration_ms else 0
track_play_counts[t.id] = track_play_counts.get(t.id, 0) + 1
# Artists (using relation) # Track Counts
# Note: This might cause N+1 query if not eager loaded. track_counts[t.id] = track_counts.get(t.id, 0) + 1
# For strictly calculation, accessing t.artists (lazy load) loop might be slow for 1000s of plays.
# Optimization: Join PlayHistory -> Track -> Artist in query.
# Let's rely on raw_data for speed if relation loading is slow, # Album Counts (using raw_data ID if available, else name)
# OR Assume we accept some latency. if t.raw_data and "album" in t.raw_data and "id" in t.raw_data["album"]:
# Better: Pre-fetch artist connections or use the new tables properly. album_ids.add(t.raw_data["album"]["id"])
# Let's use the object relation for correctness as per plan. else:
album_ids.add(t.album)
# Artist Counts (Iterate objects, not string)
for artist in t.artists: for artist in t.artists:
unique_artist_ids.add(artist.id) artist_counts[artist.id] = artist_counts.get(artist.id, 0) + 1
artist_play_counts[artist.id] = artist_play_counts.get(artist.id, 0) + 1
if artist.genres: if artist.genres:
for g in artist.genres: for g in artist.genres:
genre_counts[g] = genre_counts.get(g, 0) + 1 genre_counts[g] = genre_counts.get(g, 0) + 1
if t.raw_data and "album" in t.raw_data: # Derived Metrics
unique_album_ids.add(t.raw_data["album"]["id"]) unique_tracks = len(track_counts)
else: one_and_done = len([c for c in track_counts.values() if c == 1])
unique_album_ids.add(t.album) # Fallback
estimated_minutes = total_ms / 60000 # Top Lists
top_tracks = [
{"name": self.db.query(Track).get(tid).name, "artist": self.db.query(Track).get(tid).artist, "count": c}
for tid, c in sorted(track_counts.items(), key=lambda x: x[1], reverse=True)[:5]
]
# Top 5 Tracks top_artist_ids = sorted(artist_counts.items(), key=lambda x: x[1], reverse=True)[:5]
sorted_tracks = sorted(track_play_counts.items(), key=lambda x: x[1], reverse=True)[:5] # Fetch artist names efficiently
top_tracks = []
for tid, count in sorted_tracks:
t = track_map.get(tid)
top_tracks.append({
"name": t.name,
"artist": t.artist, # Display string
"count": count
})
# Top 5 Artists
# Need to fetch Artist names
top_artist_ids = sorted(artist_play_counts.items(), key=lambda x: x[1], reverse=True)[:5]
top_artists_objs = self.db.query(Artist).filter(Artist.id.in_([x[0] for x in top_artist_ids])).all() top_artists_objs = self.db.query(Artist).filter(Artist.id.in_([x[0] for x in top_artist_ids])).all()
artist_name_map = {a.id: a.name for a in top_artists_objs} artist_map = {a.id: a.name for a in top_artists_objs}
top_artists = [{"name": artist_map.get(aid, "Unknown"), "count": c} for aid, c in top_artist_ids]
top_artists = [] top_genres = [{"name": k, "count": v} for k, v in
for aid, count in top_artist_ids: sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:5]]
top_artists.append({
"name": artist_name_map.get(aid, "Unknown"),
"count": count
})
# Top Genres # Concentration (HHI & Gini)
sorted_genres = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)[:5] # HHI: Sum of (share)^2
top_genres = [{"name": g, "count": c} for g, c in sorted_genres] shares = [c / total_plays for c in track_counts.values()]
hhi = sum([s ** 2 for s in shares])
# Concentration # Gini Coefficient (Inequality of play distribution)
unique_tracks_count = len(unique_track_ids) sorted_shares = sorted(shares)
repeat_rate = (total_plays - unique_tracks_count) / total_plays if total_plays > 0 else 0 n = len(shares)
if n > 0:
# HHI (Herfindahl-Hirschman Index)
# Sum of (share)^2. Share = track_plays / total_plays else:
hhi = sum([(c/total_plays)**2 for c in track_play_counts.values()]) gini = 0
return { return {
"total_plays": total_plays, "total_plays": total_plays,
"estimated_minutes": int(estimated_minutes), "estimated_minutes": int(total_ms / 60000),
"unique_tracks": unique_tracks_count, "unique_tracks": unique_tracks,
"unique_artists": len(unique_artist_ids), "unique_artists": len(artist_counts),
"unique_albums": len(unique_album_ids), "unique_albums": len(album_ids),
"unique_genres": len(genre_counts), "unique_genres": len(genre_counts),
"top_tracks": top_tracks, "top_tracks": top_tracks,
"top_artists": top_artists, "top_artists": top_artists,
"top_genres": top_genres, "top_genres": top_genres,
"repeat_rate": round(repeat_rate, 3), "repeat_rate": round((total_plays - unique_tracks) / total_plays, 3) if total_plays else 0,
"one_and_done_rate": round(one_and_done / unique_tracks, 3) if unique_tracks else 0,
"concentration": { "concentration": {
"hhi": round(hhi, 4), "hhi": round(hhi, 4),
# "gini": ... (skip for now to keep it simple) "gini": round(gini, 4),
"top_1_share": round(max(shares), 3) if shares else 0
} }
} }
def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_time_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Hourly, Daily distribution, etc. Includes Part-of-Day buckets and Listening Streaks.
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
hourly_counts = [0] * 24
weekday_counts = [0] * 7 # 0=Mon, 6=Sun
if not plays:
return {"hourly_distribution": hourly_counts}
for p in plays:
# played_at is UTC in DB usually. Ensure we handle timezone if user wants local.
# For now, assuming UTC or system time.
h = p.played_at.hour
d = p.played_at.weekday()
hourly_counts[h] += 1
weekday_counts[d] += 1
peak_hour = hourly_counts.index(max(hourly_counts))
# Weekend Share
weekend_plays = weekday_counts[5] + weekday_counts[6]
weekend_share = weekend_plays / len(plays) if len(plays) > 0 else 0
return {
"hourly_distribution": hourly_counts,
"peak_hour": peak_hour,
"weekday_distribution": weekday_counts,
"weekend_share": round(weekend_share, 2)
}
def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Session logic: Gap > 20 mins = new session.
""" """
query = self.db.query(PlayHistory).filter( query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
@@ -193,76 +177,160 @@ class StatsService:
plays = query.all() plays = query.all()
if not plays: if not plays:
return {"count": 0, "avg_length_minutes": 0} return {}
hourly_counts = [0] * 24
weekday_counts = [0] * 7
part_of_day = {"morning": 0, "afternoon": 0, "evening": 0, "night": 0}
# For Streaks
active_dates = set()
for p in plays:
h = p.played_at.hour
hourly_counts[h] += 1
weekday_counts[p.played_at.weekday()] += 1
active_dates.add(p.played_at.date())
if 5 <= h < 12:
part_of_day["morning"] += 1
elif 12 <= h < 17:
part_of_day["afternoon"] += 1
elif 17 <= h < 22:
part_of_day["evening"] += 1
else:
part_of_day["night"] += 1
# Calculate Streak
sorted_dates = sorted(list(active_dates))
current_streak = 0
longest_streak = 0
if sorted_dates:
current_streak = 1
longest_streak = 1
# Check strictly consecutive days
for i in range(1, len(sorted_dates)):
delta = (sorted_dates[i] - sorted_dates[i - 1]).days
if delta == 1:
current_streak += 1
else:
longest_streak = max(longest_streak, current_streak)
current_streak = 1
longest_streak = max(longest_streak, current_streak)
weekend_plays = weekday_counts[5] + weekday_counts[6]
return {
"hourly_distribution": hourly_counts,
"peak_hour": hourly_counts.index(max(hourly_counts)),
"weekday_distribution": weekday_counts,
"weekend_share": round(weekend_plays / len(plays), 2),
"part_of_day": part_of_day,
"listening_streak": current_streak,
"longest_streak": longest_streak,
"active_days": len(active_dates)
}
def compute_session_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Includes Micro-sessions, Marathon sessions, and Energy Arcs.
"""
# Need to join Track to get Energy features for Arc analysis
query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
).order_by(PlayHistory.played_at.asc())
plays = query.all()
if not plays:
return {"count": 0}
sessions = [] sessions = []
current_session = [plays[0]] current_session = [plays[0]]
# 1. Sessionization (Gap > 20 mins)
for i in range(1, len(plays)): for i in range(1, len(plays)):
prev = plays[i-1] diff = (plays[i].played_at - plays[i-1].played_at).total_seconds() / 60
curr = plays[i]
diff = (curr.played_at - prev.played_at).total_seconds() / 60
if diff > 20: if diff > 20:
sessions.append(current_session) sessions.append(current_session)
current_session = [] current_session = []
current_session.append(plays[i])
current_session.append(curr)
sessions.append(current_session) sessions.append(current_session)
session_lengths_min = [] # 2. Analyze Sessions
for sess in sessions: lengths_min = []
if len(sess) > 1: micro_sessions = 0
start = sess[0].played_at marathon_sessions = 0
end = sess[-1].played_at energy_arcs = {"rising": 0, "falling": 0, "flat": 0, "unknown": 0}
# Add duration of last track?
# Let's just do (end - start) for simplicity + avg track duration
duration = (end - start).total_seconds() / 60
session_lengths_min.append(duration)
else:
session_lengths_min.append(3.0) # Approx 1 track
avg_min = sum(session_lengths_min) / len(session_lengths_min) if session_lengths_min else 0 for sess in sessions:
# Durations
if len(sess) > 1:
duration = (sess[-1].played_at - sess[0].played_at).total_seconds() / 60
lengths_min.append(duration)
else:
lengths_min.append(3.0) # Approx
# Types
if len(sess) <= 3: micro_sessions += 1
if len(sess) >= 20: marathon_sessions += 1
# Energy Arc (First vs Last track)
first_t = sess[0].track
last_t = sess[-1].track
if first_t and last_t and first_t.energy is not None and last_t.energy is not None:
diff = last_t.energy - first_t.energy
if diff > 0.1: energy_arcs["rising"] += 1
elif diff < -0.1: energy_arcs["falling"] += 1
else: energy_arcs["flat"] += 1
else:
energy_arcs["unknown"] += 1
avg_min = sum(lengths_min) / len(lengths_min) if lengths_min else 0
return { return {
"count": len(sessions), "count": len(sessions),
"avg_tracks": len(plays) / len(sessions), "avg_tracks": round(len(plays) / len(sessions), 1),
"avg_minutes": round(avg_min, 1), "avg_minutes": round(avg_min, 1),
"longest_session_minutes": round(max(session_lengths_min), 1) if session_lengths_min else 0 "longest_session_minutes": round(max(lengths_min), 1) if lengths_min else 0,
"micro_session_rate": round(micro_sessions / len(sessions), 2),
"marathon_session_rate": round(marathon_sessions / len(sessions), 2),
"energy_arcs": energy_arcs
} }
def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_vibe_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Aggregates Audio Features (Energy, Valence, etc.) Aggregates Audio Features + Calculates Whiplash (Transitions)
""" """
query = self.db.query(PlayHistory).filter( # Fetch plays strictly ordered by time for transition analysis
plays = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end PlayHistory.played_at <= period_end
) ).order_by(PlayHistory.played_at.asc()).all()
plays = query.all()
track_ids = list(set([p.track_id for p in plays]))
if not track_ids: if not plays:
return {} return {}
track_ids = list(set([p.track_id for p in plays]))
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all() tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
track_map = {t.id: t for t in tracks}
# Collect features # 1. Aggregates
features = { features = {k: [] for k in
"energy": [], "valence": [], "danceability": [], ["energy", "valence", "danceability", "tempo", "acousticness", "instrumentalness", "liveness",
"tempo": [], "acousticness": [], "instrumentalness": [], "speechiness", "loudness"]}
"liveness": [], "speechiness": []
}
for t in tracks: # 2. Transition Arrays (for Whiplash)
# Weight by plays? The spec implies "Per-Period Aggregates". transitions = {"tempo": [], "energy": [], "valence": []}
# Usually weighted by play count is better representation of what was HEARD.
# Let's weight by play count in this period.
play_count = len([p for p in plays if p.track_id == t.id])
previous_track = None
for i, p in enumerate(plays):
t = track_map.get(p.track_id)
if not t: continue
# Populate aggregations
if t.energy is not None: if t.energy is not None:
for _ in range(play_count):
features["energy"].append(t.energy) features["energy"].append(t.energy)
features["valence"].append(t.valence) features["valence"].append(t.valence)
features["danceability"].append(t.danceability) features["danceability"].append(t.danceability)
@@ -271,7 +339,22 @@ class StatsService:
features["instrumentalness"].append(t.instrumentalness) features["instrumentalness"].append(t.instrumentalness)
features["liveness"].append(t.liveness) features["liveness"].append(t.liveness)
features["speechiness"].append(t.speechiness) features["speechiness"].append(t.speechiness)
features["loudness"].append(t.loudness)
# Calculate Transitions (Whiplash)
if i > 0 and previous_track:
# Only count transition if within reasonable time (e.g. < 5 mins gap)
# assuming continuous listening
time_diff = (p.played_at - plays[i - 1].played_at).total_seconds()
if time_diff < 300:
if t.tempo and previous_track.tempo:
transitions["tempo"].append(abs(t.tempo - previous_track.tempo))
if t.energy and previous_track.energy:
transitions["energy"].append(abs(t.energy - previous_track.energy))
previous_track = t
# Calculate Stats
stats = {} stats = {}
for key, values in features.items(): for key, values in features.items():
valid = [v for v in values if v is not None] valid = [v for v in values if v is not None]
@@ -282,46 +365,55 @@ class StatsService:
stats[f"avg_{key}"] = None stats[f"avg_{key}"] = None
# Derived Metrics # Derived Metrics
if stats.get("avg_energy") and stats.get("avg_valence"): if stats.get("avg_energy") is not None and stats.get("avg_valence") is not None:
stats["mood_quadrant"] = { stats["mood_quadrant"] = {
"x": round(stats["avg_valence"], 2), "x": round(stats["avg_valence"], 2),
"y": round(stats["avg_energy"], 2) "y": round(stats["avg_energy"], 2)
} }
# Consistency: Inverse of average standard deviation of Mood components
avg_std = (stats["std_energy"] + stats["std_valence"]) / 2
stats["consistency_score"] = round(1.0 - avg_std, 2) # Higher = more consistent
# Whiplash Scores (Average jump between tracks)
stats["whiplash"] = {}
for k in ["tempo", "energy"]:
if transitions[k]:
stats["whiplash"][k] = round(float(np.mean(transitions[k])), 2)
else:
stats["whiplash"][k] = 0
return stats return stats
def compute_era_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_era_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
""" """
Musical Age and Era Distribution. Includes Nostalgia Gap and granular decade breakdown.
""" """
query = self.db.query(PlayHistory).filter( # Join track to get raw_data
query = self.db.query(PlayHistory).options(joinedload(PlayHistory.track)).filter(
PlayHistory.played_at >= period_start, PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end PlayHistory.played_at <= period_end
) )
plays = query.all() plays = query.all()
years = [] years = []
track_ids = list(set([p.track_id for p in plays]))
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
track_map = {t.id: t for t in tracks}
for p in plays: for p in plays:
t = track_map.get(p.track_id) t = p.track
if t and t.raw_data and "album" in t.raw_data and "release_date" in t.raw_data["album"]: if t and t.raw_data and "album" in t.raw_data:
rd = t.raw_data["album"]["release_date"] rd = t.raw_data["album"].get("release_date")
# Format can be YYYY, YYYY-MM, YYYY-MM-DD if rd:
try: try:
year = int(rd.split("-")[0]) years.append(int(rd.split("-")[0]))
years.append(year)
except: except:
pass pass
if not years: if not years:
return {"musical_age": None} return {"musical_age": None}
# Musical Age (Weighted Average)
avg_year = sum(years) / len(years) avg_year = sum(years) / len(years)
current_year = datetime.utcnow().year
# Decade breakdown # Decade Distribution
decades = {} decades = {}
for y in years: for y in years:
dec = (y // 10) * 10 dec = (y // 10) * 10
@@ -329,11 +421,13 @@ class StatsService:
decades[label] = decades.get(label, 0) + 1 decades[label] = decades.get(label, 0) + 1
total = len(years) total = len(years)
decade_dist = {k: round(v/total, 2) for k, v in decades.items()} dist = {k: round(v / total, 3) for k, v in decades.items()}
return { return {
"musical_age": int(avg_year), "musical_age": int(avg_year),
"decade_distribution": decade_dist "nostalgia_gap": int(current_year - avg_year),
"freshness_score": dist.get(f"{int(current_year / 10) * 10}s", 0), # Share of current decade
"decade_distribution": dist
} }
def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_skip_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
@@ -381,16 +475,191 @@ class StatsService:
"skip_rate": round(skips / len(plays), 3) "skip_rate": round(skips / len(plays), 3)
} }
def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]: def compute_context_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Analyzes context_uri to determine if user listens to Playlists, Albums, or Artists.
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
if not plays:
return {}
context_counts = {"playlist": 0, "album": 0, "artist": 0, "collection": 0, "unknown": 0}
unique_contexts = {}
for p in plays:
if not p.context_uri:
context_counts["unknown"] += 1
continue
# Count distinct contexts for loyalty
unique_contexts[p.context_uri] = unique_contexts.get(p.context_uri, 0) + 1
if "playlist" in p.context_uri:
context_counts["playlist"] += 1
elif "album" in p.context_uri:
context_counts["album"] += 1
elif "artist" in p.context_uri:
context_counts["artist"] += 1
elif "collection" in p.context_uri:
# "Liked Songs" usually shows up as collection
context_counts["collection"] += 1
else:
context_counts["unknown"] += 1
total = len(plays)
breakdown = {k: round(v / total, 2) for k, v in context_counts.items()}
# Top 5 Contexts (Requires resolving URI to name, possibly missing metadata here)
sorted_contexts = sorted(unique_contexts.items(), key=lambda x: x[1], reverse=True)[:5]
return { return {
"period": { "type_breakdown": breakdown,
"start": period_start.isoformat(), "album_purist_score": breakdown.get("album", 0),
"end": period_end.isoformat() "playlist_dependency": breakdown.get("playlist", 0),
}, "context_loyalty": round(len(plays) / len(unique_contexts), 2) if unique_contexts else 0,
"top_context_uris": [{"uri": k, "count": v} for k, v in sorted_contexts]
}
def compute_taste_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
"""
Mainstream vs. Hipster analysis based on Track.popularity (0-100).
"""
query = self.db.query(PlayHistory).filter(
PlayHistory.played_at >= period_start,
PlayHistory.played_at <= period_end
)
plays = query.all()
if not plays: return {}
track_ids = list(set([p.track_id for p in plays]))
tracks = self.db.query(Track).filter(Track.id.in_(track_ids)).all()
track_map = {t.id: t for t in tracks}
pop_values = []
for p in plays:
t = track_map.get(p.track_id)
if t and t.popularity is not None:
pop_values.append(t.popularity)
if not pop_values:
return {"avg_popularity": 0, "hipster_score": 0}
avg_pop = float(np.mean(pop_values))
# Hipster Score: Percentage of tracks with popularity < 30
underground_plays = len([x for x in pop_values if x < 30])
mainstream_plays = len([x for x in pop_values if x > 70])
return {
"avg_popularity": round(avg_pop, 1),
"hipster_score": round((underground_plays / len(pop_values)) * 100, 1),
"mainstream_score": round((mainstream_plays / len(pop_values)) * 100, 1),
"obscurity_rating": round(100 - avg_pop, 1)
}
def compute_lifecycle_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
    """
    Determines if tracks are 'New Discoveries' or 'Old Favorites'.

    A track is a "new discovery" when it was played in this period but has
    no play history strictly before period_start.

    Returns:
        Dict with discovery_count, discovery_rate (share of plays on newly
        discovered tracks) and recurrence_rate (share of plays on old
        favorites). Empty dict when there are no plays in the period.
    """
    # 1. All plays that fall inside the requested window.
    current_plays = self.db.query(PlayHistory).filter(
        PlayHistory.played_at >= period_start,
        PlayHistory.played_at <= period_end,
    ).all()
    if not current_plays:
        return {}
    current_track_ids = {p.track_id for p in current_plays}

    # 2. Which of those tracks already appear in history BEFORE period_start?
    old_tracks_query = self.db.query(distinct(PlayHistory.track_id)).filter(
        PlayHistory.track_id.in_(current_track_ids),
        PlayHistory.played_at < period_start,
    )
    old_track_ids = {row[0] for row in old_tracks_query.all()}

    # 3. Discovery metrics. total_plays > 0 is guaranteed by the early return.
    new_discoveries = current_track_ids - old_track_ids
    plays_on_new = sum(1 for p in current_plays if p.track_id in new_discoveries)
    total_plays = len(current_plays)
    return {
        "discovery_count": len(new_discoveries),
        "discovery_rate": round(plays_on_new / total_plays, 3),
        "recurrence_rate": round((total_plays - plays_on_new) / total_plays, 3),
    }
def compute_explicit_stats(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
    """
    Analyzes explicit content consumption.

    Returns:
        Dict with:
          - explicit_rate: fraction of plays flagged explicit
          - total_explicit_plays: absolute count of explicit plays
          - hourly_explicit_distribution: per-hour (0-23) explicit fraction
    """
    plays = (
        self.db.query(PlayHistory)
        .options(joinedload(PlayHistory.track))  # eager-load to avoid N+1 queries
        .filter(
            PlayHistory.played_at >= period_start,
            PlayHistory.played_at <= period_end,
        )
        .all()
    )
    if not plays:
        # Same key set as the populated branch so consumers never have to
        # special-case an empty period (was: mismatched "hourly_explicit_rate").
        return {
            "explicit_rate": 0,
            "total_explicit_plays": 0,
            "hourly_explicit_distribution": [0.0] * 24,
        }

    total_plays = len(plays)
    explicit_count = 0
    hourly_explicit = [0] * 24
    hourly_total = [0] * 24
    for p in plays:
        h = p.played_at.hour
        hourly_total[h] += 1
        # The explicit flag lives in the raw Spotify payload; guard against a
        # missing track row (joinedload yields None) or absent raw_data.
        t = p.track
        if t is not None and t.raw_data and t.raw_data.get("explicit"):
            explicit_count += 1
            hourly_explicit[h] += 1

    # Per-hour explicit share; 0.0 for hours with no plays at all.
    hourly_rates = [
        round(hourly_explicit[i] / hourly_total[i], 2) if hourly_total[i] else 0.0
        for i in range(24)
    ]
    return {
        "explicit_rate": round(explicit_count / total_plays, 3),
        "total_explicit_plays": explicit_count,
        "hourly_explicit_distribution": hourly_rates,
    }
def generate_full_report(self, period_start: datetime, period_end: datetime) -> Dict[str, Any]:
    """
    Assembles every per-category stat computation into a single report dict,
    then appends a comparison against the preceding period.

    Returns:
        Dict keyed by category name; each value is the dict produced by the
        corresponding compute_* method, plus "period" bounds (ISO-8601) and a
        "comparison" entry.
    """
    # 1. Calculate all current stats for the window.
    current_stats = {
        "period": {"start": period_start.isoformat(), "end": period_end.isoformat()},
        "volume": self.compute_volume_stats(period_start, period_end),
        "time_habits": self.compute_time_stats(period_start, period_end),
        "sessions": self.compute_session_stats(period_start, period_end),
        "context": self.compute_context_stats(period_start, period_end),
        "vibe": self.compute_vibe_stats(period_start, period_end),
        "era": self.compute_era_stats(period_start, period_end),
        "taste": self.compute_taste_stats(period_start, period_end),
        "lifecycle": self.compute_lifecycle_stats(period_start, period_end),
        "flags": self.compute_explicit_stats(period_start, period_end),
        "skips": self.compute_skip_stats(period_start, period_end),
    }
    # 2. Comparison needs the finished current_stats plus the period bounds so
    # it can evaluate the equivalent preceding window.
    current_stats["comparison"] = self.compute_comparison(current_stats, period_start, period_end)
    return current_stats

16
backend/run_scheduler.py Normal file
View File

@@ -0,0 +1,16 @@
# Standalone scheduler entry point: triggers the analysis pipeline once a day.
import schedule
import time
from run_analysis import run_analysis_pipeline


def job():
    # Runs the analysis over the most recent day of listening history.
    print("Running daily analysis...")
    # Analyze last 24 hours
    run_analysis_pipeline(days=1)


# Schedule for 03:00 AM (server local time — TODO confirm desired timezone)
schedule.every().day.at("03:00").do(job)

print("Scheduler started...")
# Poll once a minute; schedule.run_pending() fires any jobs that are due.
while True:
    schedule.run_pending()
    time.sleep(60)