# MusicAnalyser/backend/app/ingest.py

import asyncio
import os
from datetime import datetime
from sqlalchemy.orm import Session
from .models import Track, PlayHistory, Artist
from .database import SessionLocal
from .services.spotify_client import SpotifyClient
from .services.reccobeats_client import ReccoBeatsClient
from dateutil import parser

# Initialize Spotify Client (env vars will be populated later)
def get_spotify_client():
    """
    Build a SpotifyClient from the process environment.

    Reads SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET and SPOTIFY_REFRESH_TOKEN
    at call time; a variable that is not set is passed through as None.
    """
    credentials = {
        "client_id": os.getenv("SPOTIFY_CLIENT_ID"),
        "client_secret": os.getenv("SPOTIFY_CLIENT_SECRET"),
        "refresh_token": os.getenv("SPOTIFY_REFRESH_TOKEN"),
    }
    return SpotifyClient(**credentials)

def get_reccobeats_client():
    """Create a new ReccoBeatsClient using its default constructor arguments."""
    client = ReccoBeatsClient()
    return client

async def ensure_artists_exist(db: Session, artists_data: list):
    """
    Ensure every artist in ``artists_data`` has a row in the Artist table.

    Artists that are not found are created and added to the session. Nothing
    is flushed or committed here; the caller owns the transaction.

    Args:
        db: Session used for the lookups and to register new artists.
        artists_data: Dicts with at least ``"id"`` and ``"name"`` keys (the
            callers in this module pass the ``artists`` array of a Spotify
            track object).

    Returns:
        A list of Artist objects, one per input entry, in input order.
    """
    artist_objects = []
    # Artists created during this call, keyed by id. A repeated id then reuses
    # the pending object even if the session does not autoflush before queries.
    created = {}
    for a_data in artists_data:
        artist_id = a_data["id"]
        artist = created.get(artist_id)
        if artist is None:
            artist = db.query(Artist).filter(Artist.id == artist_id).first()
        if not artist:
            # `genres` is deliberately left unset so the column is stored as
            # SQL NULL. enrich_tracks selects artists with `genres == None`;
            # writing an explicit [] here would exclude the new artist from
            # genre enrichment permanently.
            artist = Artist(id=artist_id, name=a_data["name"])
            db.add(artist)
            created[artist_id] = artist
        artist_objects.append(artist)
    return artist_objects

# Track attributes filled from a ReccoBeats audio-features payload. Each payload
# key has the same name as the Track attribute it populates.
_AUDIO_FEATURE_FIELDS = (
    "danceability",
    "energy",
    "key",
    "loudness",
    "mode",
    "speechiness",
    "acousticness",
    "instrumentalness",
    "liveness",
    "valence",
    "tempo",
)

# Rows handled per enrichment pass, and ids sent per Spotify artists request
# (Spotify allows fetching 50 artists at a time).
_ENRICH_BATCH_SIZE = 50


def _feature_track_id(feature: dict):
    """Return the track id of one audio-features payload, or None if absent."""
    track_id = feature.get("id")
    if track_id:
        return track_id
    # No explicit id: fall back to the id embedded in the payload's href.
    href = feature.get("href") or ""
    for marker in ("tracks/", "track/"):
        if marker in href:
            return href.split(marker)[1].split("?")[0] or None
    return None


async def _enrich_audio_features(db: Session, recco_client: ReccoBeatsClient):
    """Fill audio features for one batch of tracks whose danceability is NULL."""
    tracks = (
        db.query(Track)
        .filter(Track.danceability.is_(None))
        .limit(_ENRICH_BATCH_SIZE)
        .all()
    )
    print(f"DEBUG: Found {len(tracks)} tracks missing audio features.")
    if not tracks:
        return

    print(f"Enriching {len(tracks)} tracks with audio features (ReccoBeats)...")
    try:
        features_list = await recco_client.get_audio_features([t.id for t in tracks])
    except Exception as e:
        # Best effort, like the Spotify fetch in ingest_recently_played: report
        # the failure and leave the tracks untouched for the next pass.
        print(f"Error fetching audio features from ReccoBeats: {e}")
        return

    features_map = {}
    for feature in features_list or []:
        track_id = _feature_track_id(feature)
        if track_id:
            features_map[track_id] = feature

    updated_count = 0
    for track in tracks:
        data = features_map.get(track.id)
        if data:
            for field in _AUDIO_FEATURE_FIELDS:
                setattr(track, field, data.get(field))
            updated_count += 1

    print(f"Updated {updated_count} tracks with audio features.")
    db.commit()


async def _enrich_artist_genres(db: Session, spotify_client: SpotifyClient):
    """Fill genres for one batch of artists whose genres column is NULL."""
    artists = (
        db.query(Artist)
        .filter(Artist.genres.is_(None))
        .limit(_ENRICH_BATCH_SIZE)
        .all()
    )
    if not artists:
        return

    print(f"Enriching {len(artists)} artists with genres (Spotify)...")
    for start in range(0, len(artists), _ENRICH_BATCH_SIZE):
        batch = artists[start:start + _ENRICH_BATCH_SIZE]
        try:
            payloads = await spotify_client.get_artists([a.id for a in batch])
        except Exception as e:
            # Leave genres NULL so this batch is retried on the next pass.
            print(f"Error fetching artist genres from Spotify: {e}")
            continue

        genres_by_id = {
            payload["id"]: payload.get("genres", [])
            for payload in payloads or []
            if payload
        }
        for artist in batch:
            genres = genres_by_id.get(artist.id)
            # An artist missing from a successful response gets [] so it is
            # not selected again on every pass.
            artist.genres = genres if genres is not None else []

    db.commit()


async def enrich_tracks(db: Session, spotify_client: SpotifyClient, recco_client: ReccoBeatsClient):
    """
    Enrich stored tracks with audio features and stored artists with genres.

    Two independent steps run in order, each on at most one batch of rows and
    each committing its own changes:

    1. Tracks whose ``danceability`` is NULL get audio features from ReccoBeats.
    2. Artists whose ``genres`` is NULL get genres from Spotify.

    A failure of one external call is printed and does not prevent the other
    step from running.

    NOTE: a track for which ReccoBeats returns no features keeps a NULL
    ``danceability`` and is therefore selected again on later passes.

    Args:
        db: Session used for the queries and commits.
        spotify_client: Client whose ``get_artists(ids)`` is awaited.
        recco_client: Client whose ``get_audio_features(ids)`` is awaited.
    """
    await _enrich_audio_features(db, recco_client)
    await _enrich_artist_genres(db, spotify_client)


async def ingest_recently_played(db: Session):
    """
    Pull the latest plays from Spotify and store them.

    For each returned item this creates the Track (and its Artist rows) when
    the track is unknown, links artists from the stored raw payload when an
    existing track has none, and records a PlayHistory row unless one already
    exists for the same track and timestamp. Afterwards the enrichment pass
    is run. If the Spotify fetch raises, the error is printed and nothing
    else happens.

    Args:
        db: Session used for all queries and commits.
    """
    spotify_client = get_spotify_client()
    recco_client = get_reccobeats_client()

    try:
        items = await spotify_client.get_recently_played(limit=50)
    except Exception as e:
        print(f"Error connecting to Spotify: {e}")
        return

    print(f"Fetched {len(items)} items from Spotify.")

    for entry in items:
        payload = entry["track"]
        played_at = parser.isoparse(entry["played_at"])
        track_id = payload["id"]

        track = db.query(Track).filter(Track.id == track_id).first()

        if track is None:
            print(f"New track found: {payload['name']}")
            # Legacy denormalised artist string, kept next to the relation.
            artist_names = ", ".join(a["name"] for a in payload["artists"])
            track = Track(
                id=track_id,
                name=payload["name"],
                artist=artist_names,
                album=payload["album"]["name"],
                duration_ms=payload["duration_ms"],
                popularity=payload["popularity"],
                raw_data=payload,
            )

            # Link the artist relation before the track is persisted.
            track.artists = await ensure_artists_exist(db, payload.get("artists", []))

            db.add(track)
            db.commit()

        # Runs for existing tracks too (e.g. rows stored before the artist
        # relation existed): link artists from the saved raw payload.
        if not track.artists and track.raw_data and "artists" in track.raw_data:
            print(f"Backfilling artists for track {track.name}")
            track.artists = await ensure_artists_exist(db, track.raw_data["artists"])
            db.commit()

        already_recorded = (
            db.query(PlayHistory)
            .filter(
                PlayHistory.track_id == track_id,
                PlayHistory.played_at == played_at,
            )
            .first()
        )
        if already_recorded:
            continue

        print(f" recording play: {payload['name']} at {played_at}")
        context = entry.get("context")
        context_uri = context.get("uri") if context else None
        db.add(
            PlayHistory(
                track_id=track_id,
                played_at=played_at,
                context_uri=context_uri,
            )
        )

    db.commit()

    # Enrich
    await enrich_tracks(db, spotify_client, recco_client)

async def run_worker():
    """
    Background worker loop: ingest recently played tracks every 60 seconds.

    One session from SessionLocal is shared by all cycles and is closed when
    the loop ends. A failure inside a single ingest cycle is printed, the
    session is rolled back, and polling continues with the next cycle, so a
    transient error does not stop the worker for good. Task cancellation
    still ends the loop, because ``asyncio.CancelledError`` is not an
    ``Exception`` subclass on Python 3.8+.
    """
    db = SessionLocal()

    try:
        while True:
            print("Worker: Polling Spotify...")
            try:
                await ingest_recently_played(db)
            except Exception as e:
                print(f"Worker: Ingest cycle failed, retrying next cycle: {e}")
                # After a failed flush/commit the session must be rolled back
                # before it can be used again; this also discards the failed
                # cycle's partial changes.
                db.rollback()
            print("Worker: Sleeping for 60 seconds...")
            await asyncio.sleep(60)
    except Exception as e:
        # Only reached if the recovery path itself fails (e.g. the rollback).
        print(f"Worker crashed: {e}")
    finally:
        db.close()