#!/usr/bin/env python3
"""Ingest YouTube voice-capture audio from a Nextcloud folder.

MVP behavior:
- Poll a configured Nextcloud WebDAV folder.
- Download new audio files.
- Deduplicate exact source files by sha256 checksum.
- Transcribe with faster-whisper.
- Write raw transcript, JSON metadata, and Markdown staging note.
- Do not write to the user's live Obsidian vault.
"""

from __future__ import annotations

import argparse
import base64
import datetime as dt
import hashlib
import json
import os
import re
import sys
import tempfile
import urllib.error
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
from typing import Any

# Lower-case file suffixes (with leading dot) treated as audio when the config
# file does not provide its own "audio_extensions" list.
AUDIO_EXTENSIONS = {".mp3", ".m4a", ".wav", ".opus", ".ogg", ".webm", ".aac", ".flac"}


def utc_now() -> str:
    """Return the current UTC time as an ISO-8601 string with whole-second precision."""
    current = dt.datetime.now(tz=dt.timezone.utc)
    return current.isoformat(timespec="seconds")


def slugify(value: str, max_len: int = 80) -> str:
    """Turn an arbitrary name into a filesystem-friendly slug.

    Every run of characters outside ``A-Z a-z 0-9 . _ -`` collapses to a single
    ``-`` and leading/trailing ``-``, ``.`` and ``_`` are removed. If nothing
    is left the slug is ``"audio"``.

    Args:
        value: The text to slugify, e.g. a remote file name.
        max_len: Maximum length of the returned slug.

    Returns:
        The slug, at most ``max_len`` characters long.
    """
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", value).strip("-._")
    # Strip again after truncating: the cut may land right after a separator,
    # which would otherwise leave a dangling "-", "." or "_" at the end.
    return (cleaned or "audio")[:max_len].rstrip("-._")


class NextcloudClient:
    """Minimal Nextcloud WebDAV client built on urllib with HTTP Basic auth.

    It lists the files directly inside one folder (PROPFIND with Depth 1) and
    downloads a file given the href returned by that listing.
    """

    def __init__(self, base_url: str, username: str, password: str):
        """Remember the server root and precompute the Basic auth header.

        Args:
            base_url: Root URL of the Nextcloud instance. It may carry a path
                prefix (``https://host/nextcloud``); trailing slashes are dropped.
            username: Account name; also used in the WebDAV files path.
            password: Password (or app password) for ``username``.
        """
        self.base_url = base_url.rstrip("/")
        self.username = username
        token = base64.b64encode(f"{username}:{password}".encode()).decode()
        self.headers = {"Authorization": f"Basic {token}"}

    def dav_url(self, path: str) -> str:
        """Return the WebDAV URL for ``path`` inside the user's files area.

        Each path segment is percent-encoded individually; empty segments and
        leading/trailing slashes are ignored.
        """
        quoted_user = urllib.parse.quote(self.username, safe="")
        parts = [urllib.parse.quote(p, safe="") for p in path.strip("/").split("/") if p]
        suffix = "/".join(parts)
        return f"{self.base_url}/remote.php/dav/files/{quoted_user}/{suffix}"

    def request(self, method: str, url: str, data: bytes | None = None, headers: dict[str, str] | None = None) -> bytes:
        """Send an authenticated request and return the whole response body.

        Args:
            method: HTTP/WebDAV verb, e.g. ``"GET"`` or ``"PROPFIND"``.
            url: Absolute URL to request.
            data: Optional request body.
            headers: Extra headers; they override the auth header on key clash.

        Raises:
            urllib.error.URLError: Raised by ``urlopen`` on network failures and,
                as its ``HTTPError`` subclass, on HTTP error statuses.
        """
        all_headers = dict(self.headers)
        if headers:
            all_headers.update(headers)
        req = urllib.request.Request(url, data=data, headers=all_headers, method=method)
        with urllib.request.urlopen(req, timeout=120) as resp:
            return resp.read()

    def list_folder(self, folder: str) -> list[dict[str, Any]]:
        """List the non-collection entries directly inside ``folder``.

        Returns:
            One dict per file with the keys ``name``, ``href``,
            ``last_modified``, ``etag`` and ``size``.
        """
        body = b'''<?xml version="1.0" encoding="utf-8" ?>
<d:propfind xmlns:d="DAV:">
  <d:prop>
    <d:getetag/>
    <d:getcontentlength/>
    <d:getlastmodified/>
    <d:resourcetype/>
  </d:prop>
</d:propfind>'''
        data = self.request("PROPFIND", self.dav_url(folder), body, {"Depth": "1", "Content-Type": "application/xml"})
        return self._parse_listing(data, folder)

    def _parse_listing(self, data: bytes, folder: str) -> list[dict[str, Any]]:
        """Turn a PROPFIND multistatus body into the list returned by ``list_folder``."""
        root = ET.fromstring(data)
        ns = {"d": "DAV:"}
        # Compare decoded paths on both sides. dav_url() percent-encodes the
        # folder, so without unquoting a folder such as "Voice Notes" would
        # never equal its own (decoded) href and the self entry would slip by.
        folder_path = urllib.parse.unquote(urllib.parse.urlparse(self.dav_url(folder)).path).rstrip("/")
        items: list[dict[str, Any]] = []
        for resp in root.findall("d:response", ns):
            href = resp.findtext("d:href", default="", namespaces=ns)
            href_path = urllib.parse.unquote(urllib.parse.urlparse(href).path)
            if href_path.rstrip("/") == folder_path:
                continue  # the folder itself
            prop = resp.find("d:propstat/d:prop", ns)
            if prop is None:
                continue
            if prop.find("d:resourcetype/d:collection", ns) is not None:
                continue  # sub-folders are not descended into
            items.append({
                "name": Path(href_path).name,
                "href": href,
                "last_modified": prop.findtext("d:getlastmodified", default="", namespaces=ns),
                "etag": (prop.findtext("d:getetag", default="", namespaces=ns) or "").strip('"'),
                "size": int(prop.findtext("d:getcontentlength", default="0", namespaces=ns) or 0),
            })
        return items

    def _resolve_href(self, href: str) -> str:
        """Return the absolute URL for an href taken from a PROPFIND response.

        Path-only hrefs are resolved against the server root of ``base_url``.
        Plain concatenation would repeat the path prefix when Nextcloud is
        served from a sub-path, because the href already includes that prefix.
        """
        if href.startswith(("http://", "https://")):
            # NOTE(review): an absolute href is requested as-is, with the auth
            # header attached, even if it names a different host.
            return href
        return urllib.parse.urljoin(self.base_url + "/", href)

    def download_href(self, href: str) -> bytes:
        """Download the resource behind ``href`` and return its bytes."""
        return self.request("GET", self._resolve_href(href))


def load_json(path: Path, default: Any) -> Any:
    """Return the parsed JSON content of ``path``, or ``default`` if it does not exist."""
    if path.exists():
        raw = path.read_text()
        return json.loads(raw)
    return default


def save_json_atomic(path: Path, data: Any) -> None:
    """Write ``data`` as indented, key-sorted JSON to ``path`` via a temp file.

    The JSON is written to a temporary file in the target directory and then
    renamed over ``path``, so ``path`` never holds a half-written document.
    If serialisation or the rename fails, the temporary file is removed and
    the error is re-raised; an existing ``path`` is left untouched.

    Args:
        path: Destination file; parent directories are created as needed.
        data: JSON-serialisable value.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = None
    try:
        with tempfile.NamedTemporaryFile("w", dir=path.parent, delete=False) as tmp:
            tmp_path = Path(tmp.name)
            json.dump(data, tmp, indent=2, sort_keys=True)
            tmp.write("\n")
        tmp_path.replace(path)
    except BaseException:
        # Do not leave a stray temp file next to the target on failure.
        if tmp_path is not None:
            try:
                tmp_path.unlink()
            except OSError:
                pass
        raise


# Whisper models already loaded in this process, keyed by (model name, cache dir),
# so a batch of files loads each model once instead of once per file.
_WHISPER_MODELS: dict[tuple[str, str], Any] = {}


def transcribe(audio_path: Path, model_name: str, model_cache_dir: Path) -> tuple[str, list[dict[str, Any]]]:
    """Transcribe one audio file with faster-whisper on CPU (int8).

    Args:
        audio_path: Audio file to transcribe.
        model_name: Whisper model name handed to ``WhisperModel``.
        model_cache_dir: Directory used as the model download root; created
            if missing.

    Returns:
        ``(text, segments)`` where ``text`` is the non-empty segment texts
        joined by single spaces and ``segments`` is a list of
        ``{"start", "end", "text"}`` dicts in the order yielded by the model.

    Raises:
        ImportError: If faster-whisper is not installed.
    """
    # Imported here rather than at module level, so the dependency is only
    # required when a file is actually transcribed.
    from faster_whisper import WhisperModel

    model_cache_dir.mkdir(parents=True, exist_ok=True)
    cache_key = (model_name, str(model_cache_dir))
    model = _WHISPER_MODELS.get(cache_key)
    if model is None:
        model = WhisperModel(model_name, device="cpu", compute_type="int8", download_root=str(model_cache_dir))
        _WHISPER_MODELS[cache_key] = model

    segments_iter, _info = model.transcribe(str(audio_path), vad_filter=True)
    segments = [{"start": seg.start, "end": seg.end, "text": seg.text.strip()} for seg in segments_iter]
    text = " ".join(item["text"] for item in segments if item["text"]).strip()
    return text, segments



def parse_http_datetime(value: str) -> dt.datetime | None:
    """Parse an RFC 2822 / HTTP-style date into an aware UTC datetime.

    Returns ``None`` for empty or unparseable input. A result without timezone
    information is taken to be UTC.
    """
    if not value:
        return None
    try:
        from email.utils import parsedate_to_datetime

        moment = parsedate_to_datetime(value)
        aware = moment if moment.tzinfo is not None else moment.replace(tzinfo=dt.timezone.utc)
        return aware.astimezone(dt.timezone.utc)
    except Exception:
        return None


def parse_iso_datetime(value: str) -> dt.datetime | None:
    """Parse an ISO-8601 timestamp into an aware UTC datetime.

    A ``Z`` suffix is accepted as ``+00:00``. A timestamp without an offset is
    taken to be UTC (the same assumption ``parse_http_datetime`` makes), so the
    result does not depend on the machine's local timezone.

    Returns:
        The UTC datetime, or ``None`` for empty or unparseable input.
    """
    if not value:
        return None
    try:
        parsed = dt.datetime.fromisoformat(value.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # astimezone() on a naive value would assume local time.
            parsed = parsed.replace(tzinfo=dt.timezone.utc)
        return parsed.astimezone(dt.timezone.utc)
    except (AttributeError, TypeError, ValueError, OverflowError):
        return None


def record_datetime(record: dict[str, Any]) -> dt.datetime:
    """Pick the UTC timestamp that represents ``record``.

    Preference order: the source file's last-modified header, then the
    ingestion timestamp, then the current time when neither can be parsed.
    """
    from_source = parse_http_datetime(record.get("source_last_modified", ""))
    if from_source:
        return from_source
    from_ingest = parse_iso_datetime(record.get("ingested_at", ""))
    if from_ingest:
        return from_ingest
    return dt.datetime.now(dt.timezone.utc)


def record_day(record: dict[str, Any]) -> str:
    """Return the UTC calendar day (``YYYY-MM-DD``) that ``record`` belongs to."""
    moment = record_datetime(record)
    return moment.date().isoformat()


def load_records(records_dir: Path) -> list[dict[str, Any]]:
    """Load every ``*.json`` record in ``records_dir``, oldest first.

    Records sharing a ``sha256`` are collapsed to the most recent one: exact
    duplicate source files are processing artifacts rather than separate human
    events, while distinct recordings have distinct hashes and stay separate.
    Records without a hash are all kept. Files that cannot be loaded are
    reported on stderr and skipped.
    """
    latest_by_sha: dict[str, dict[str, Any]] = {}
    unhashed: list[dict[str, Any]] = []
    for record_file in sorted(records_dir.glob("*.json")):
        try:
            record = json.loads(record_file.read_text())
            digest = record.get("sha256")
            if not digest:
                unhashed.append(record)
                continue
            previous = latest_by_sha.get(digest)
            # Keep the stored record only when it is strictly newer.
            if previous is not None and record_datetime(record) < record_datetime(previous):
                continue
            latest_by_sha[digest] = record
        except Exception as exc:
            print(f"failed to load record {record_file}: {exc}", file=sys.stderr)
    combined = list(latest_by_sha.values()) + unhashed
    combined.sort(key=record_datetime)
    return combined


def read_transcript(record: dict[str, Any], max_chars: int = 1200) -> str:
    """Return the (possibly truncated) transcript text referenced by ``record``.

    Args:
        record: Capture record; its ``transcript_path`` names the text file.
        max_chars: Longest excerpt returned before an ellipsis is appended.

    Returns:
        The stripped transcript, cut to ``max_chars`` characters plus ``…``
        when longer, or ``""`` when the record names no readable regular file.
    """
    raw_path = record.get("transcript_path")
    if not raw_path:
        # Path("") is the current directory, which exists but cannot be read
        # as text, so a missing or empty path has to be caught up front.
        return ""
    path = Path(raw_path)
    if not path.is_file():
        return ""
    text = path.read_text(errors="replace").strip()
    if len(text) > max_chars:
        return text[:max_chars].rstrip() + "…"
    return text


def _capture_section_lines(record: dict[str, Any]) -> list[str]:
    """Render the Markdown lines describing one voice capture in a daily note."""
    when = record_datetime(record).strftime("%H:%M:%S UTC")
    status = record.get("status", "unknown")
    transcript = read_transcript(record)
    lines = [
        f"### {when} — {record.get('source_name', record.get('capture_id', 'capture'))}",
        "",
        f"- Capture ID: `{record.get('capture_id', '')}`",
        f"- Status: `{status}`",
        f"- Source audio: `{record.get('raw_audio_path', '')}`",
        f"- Staging record: `{record.get('record_path', '')}`",
        "",
        "#### User capture",
        "",
    ]
    if transcript:
        lines.extend(["```text", transcript, "```", ""])
    elif status == "transcription_failed":
        lines.extend([f"Transcription failed: `{record.get('error', 'unknown error')}`", ""])
    else:
        lines.extend(["No transcript text available.", ""])
    lines.extend([
        "#### Candidate video match",
        "",
        "TODO: unmatched. Later matching should use capture time, transcript clues, and YouTube activity.",
        "",
    ])
    return lines


def _daily_note_lines(day: str, day_records: list[dict[str, Any]]) -> list[str]:
    """Render all lines of the staging note for ``day`` (front matter, captures, follow-up)."""
    lines: list[str] = [
        "---",
        "created_by: assistant",
        "review_status: unreviewed",
        "source_type: youtube_daily_staging_note",
        "assistant_generated: true",
        "contains_user_words: true",
        "contains_assistant_words: true",
        f"youtube_day: {day}",
        "obsidian_import_status: staging_only",
        "---",
        "",
        f"# YouTube daily note — {day}",
        "",
        "> Staging note generated outside the live Obsidian vault. Raw audio, full transcripts, logs, and JSON processing records remain in server-side staging.",
        "",
        "## Watched videos",
        "",
        "TODO: YouTube activity/history import is not implemented yet. Future entries should include non-Short videos watched this day with title, author/channel, URL, thumbnail, captions/transcript status, and assistant summary.",
        "",
        "Shorts policy: ignore Shorts by default unless explicitly discussed or requested.",
        "",
        "## Voice captures",
        "",
    ]
    for record in day_records:
        lines.extend(_capture_section_lines(record))
    lines.extend([
        "## Follow-up",
        "",
        "- Review unmatched captures.",
        "- Import YouTube history/activity for this date when available.",
        "- Promote selected items to Obsidian only after review.",
        "",
    ])
    return lines


def generate_daily_notes(records_dir: Path, daily_notes_dir: Path) -> None:
    """Rebuild one Markdown staging note per UTC day from the stored records.

    Records are loaded from ``records_dir``, grouped by the day returned by
    ``record_day`` and written to ``daily_notes_dir/<YYYY-MM-DD>.md``,
    overwriting any earlier note for that day. Only days with at least one
    record get a note.

    Args:
        records_dir: Directory holding the per-capture JSON records.
        daily_notes_dir: Output directory; created if missing.
    """
    by_day: dict[str, list[dict[str, Any]]] = {}
    for record in load_records(records_dir):
        by_day.setdefault(record_day(record), []).append(record)

    daily_notes_dir.mkdir(parents=True, exist_ok=True)
    for day, day_records in sorted(by_day.items()):
        day_records.sort(key=record_datetime)
        note_text = "\n".join(_daily_note_lines(day, day_records))
        # The note always contains non-ASCII characters (em dash, ellipsis),
        # so write UTF-8 explicitly instead of relying on the locale encoding.
        (daily_notes_dir / f"{day}.md").write_text(note_text, encoding="utf-8")


def write_markdown(path: Path, record: dict[str, Any], transcript: str) -> None:
    """Write the per-capture Markdown staging note as UTF-8.

    Args:
        path: Destination ``.md`` file; parent directories are created.
        record: Capture record. ``capture_id``, ``source_name``, ``sha256``,
            ``ingested_at``, ``raw_audio_path`` and ``transcript_path`` are
            required; ``source_last_modified`` and ``source_size`` are optional.
        transcript: Text placed verbatim inside the "User capture" block.

    Raises:
        KeyError: If a required record key is missing.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    title = record["source_name"]
    content = f"""---
created_by: assistant
review_status: unreviewed
source_type: youtube_voice_capture
assistant_generated: true
contains_user_words: true
contains_assistant_words: false
capture_id: {record['capture_id']}
source_name: {json.dumps(record['source_name'])}
source_sha256: {record['sha256']}
ingested_at: {record['ingested_at']}
source_last_modified: {json.dumps(record.get('source_last_modified', ''))}
---

# Voice capture: {title}

## User capture

Verbatim Whisper transcript of the user's audio memo. This may contain transcription errors.

```text
{transcript}
```

## Source material

- Original audio: `{record['raw_audio_path']}`
- Raw transcript: `{record['transcript_path']}`
- Source Nextcloud file: `{record['source_name']}`
- Source size: {record.get('source_size', 0)} bytes

## Assistant summary

TODO: not generated in MVP.

## Candidate video match

TODO: not matched in MVP.

## Follow-up questions

- Which YouTube video did this refer to?
- Should this become a per-video note, daily capture entry, or both?
"""
    # File names and transcripts can contain any Unicode text; write UTF-8
    # explicitly so the result does not depend on the process locale.
    path.write_text(content, encoding="utf-8")


def main() -> int:
    """Run one ingest pass: list, download, dedupe, transcribe, write notes.

    Configuration comes from the JSON file given with ``--config``; the
    Nextcloud password is read from ``NEXTCLOUD_APP_PASSWORD``.

    Returns:
        ``0`` on success, ``1`` if at least one download failed (the rest of
        the batch is still processed), ``2`` if the password is not set.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--config", required=True)
    args = parser.parse_args()

    config_path = Path(args.config)
    config = json.loads(config_path.read_text())
    password = os.environ.get("NEXTCLOUD_APP_PASSWORD")
    if not password:
        print("NEXTCLOUD_APP_PASSWORD is not set", file=sys.stderr)
        return 2

    state_path = Path(config["state_path"])
    state = load_json(state_path, {"processed_sha256": {}, "processed_hrefs": {}})
    processed_sha = state.setdefault("processed_sha256", {})
    processed_hrefs = state.setdefault("processed_hrefs", {})
    # href -> etag of files already handled. Lets later polls skip unchanged
    # files without downloading them again just to compute their checksum.
    # To force a file to be re-ingested, remove its entry here as well as its
    # entry in processed_sha256.
    processed_etags = state.setdefault("processed_etags", {})

    raw_dir = Path(config["raw_audio_dir"])
    transcript_dir = Path(config["transcript_dir"])
    records_dir = Path(config["records_dir"])
    notes_dir = Path(config["notes_staging_dir"])
    daily_notes_dir = Path(config.get("daily_notes_dir", str(notes_dir / "daily")))
    model_cache_dir = Path(config.get("model_cache_dir", "/var/lib/youtube-capture-ingest/models"))
    for d in (raw_dir, transcript_dir, records_dir, notes_dir, daily_notes_dir):
        d.mkdir(parents=True, exist_ok=True)

    client = NextcloudClient(config["nextcloud_base_url"], config["username"], password)
    configured_exts = config.get("audio_extensions", sorted(AUDIO_EXTENSIONS))
    # Accept "mp3" as well as ".mp3"; Path.suffix always includes the dot.
    audio_exts = {e.lower() if e.startswith(".") else f".{e.lower()}" for e in configured_exts}
    folder = config["remote_folder"]
    model_name = config.get("whisper_model", "tiny")

    items = client.list_folder(folder)
    processed = 0
    skipped = 0
    download_failures = 0
    for item in items:
        suffix = Path(item["name"]).suffix.lower()
        if suffix not in audio_exts:
            skipped += 1
            continue
        href = item["href"]
        etag = item.get("etag", "")
        if etag and processed_etags.get(href) == etag:
            # Same resource, same etag as last time: nothing new to fetch.
            skipped += 1
            continue
        try:
            data = client.download_href(href)
        except Exception as exc:  # keep the batch moving; the file is retried on the next run
            download_failures += 1
            print(f"failed to download {item['name']}: {type(exc).__name__}: {exc}", file=sys.stderr)
            continue
        sha = hashlib.sha256(data).hexdigest()
        if sha in processed_sha:
            if etag:
                processed_etags[href] = etag
            skipped += 1
            continue
        capture_id = f"{dt.datetime.now(dt.timezone.utc).strftime('%Y%m%dT%H%M%SZ')}-{sha[:12]}"
        safe_name = slugify(item["name"])
        raw_path = raw_dir / f"{capture_id}-{safe_name}"
        raw_path.write_bytes(data)

        transcript_path = transcript_dir / f"{capture_id}.txt"
        record_path = records_dir / f"{capture_id}.json"
        note_path = notes_dir / f"{capture_id}.md"
        record = {
            "capture_id": capture_id,
            "source_name": item["name"],
            "source_href": href,
            "source_last_modified": item.get("last_modified", ""),
            "source_etag": etag,
            "source_size": item.get("size", 0),
            "sha256": sha,
            "ingested_at": utc_now(),
            "raw_audio_path": str(raw_path),
            "transcript_path": str(transcript_path),
            "record_path": str(record_path),
            "staging_note_path": str(note_path),
        }
        try:
            transcript, segments = transcribe(raw_path, model_name, model_cache_dir)
            transcript_path.write_text(transcript + "\n")
            record.update({"segments": segments, "status": "transcribed_unmatched"})
            write_markdown(note_path, record, transcript)
            processed += 1
        except Exception as exc:  # keep the batch moving; preserve failed artifact for review
            record.update({
                "segments": [],
                "status": "transcription_failed",
                "error": f"{type(exc).__name__}: {exc}",
            })
            transcript_path.write_text("")
            write_markdown(note_path, record, "TRANSCRIPTION FAILED: " + record["error"])
            print(f"failed to transcribe {item['name']}: {record['error']}", file=sys.stderr)

        save_json_atomic(record_path, record)
        processed_sha[sha] = record
        processed_hrefs[href] = sha
        if etag:
            processed_etags[href] = etag
        # Save after every file so a later bad file does not cause successful work to be forgotten.
        save_json_atomic(state_path, state)

    generate_daily_notes(records_dir, daily_notes_dir)
    state["last_run_at"] = utc_now()
    state["last_seen_count"] = len(items)
    save_json_atomic(state_path, state)
    print(f"youtube-capture-ingest: processed={processed} skipped={skipped} seen={len(items)} daily_notes_dir={daily_notes_dir}")
    return 1 if download_failures else 0


# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
