"""Web chat export ingester (ChatGPT, Claude.ai, etc.)"""
import json
import logging
from pathlib import Path
from typing import Dict, List, Any

from ucts.core.models import Session, Message
from ucts.ingestion.base import SessionIngester, ParseError, InvalidFormatError

# Configure logging
logger = logging.getLogger(__name__)


class WebChatIngester(SessionIngester):
    """Ingest web chat exports from Claude.ai, ChatGPT, etc."""

    def ingest(self, source_path: str) -> Session:
        """
        Parse a web chat JSON export into a Session.

        The decoded JSON is handed to the first parser whose detector matches:
        - ChatGPT exports (``_is_chatgpt_format``)
        - Claude.ai exports (``_is_claude_web_format``)
        - anything else goes to the generic JSON parser

        Args:
            source_path: Location of the export file to read.

        Returns:
            The Session built by the selected parser.

        Raises:
            ParseError: If the file content is not valid JSON. The underlying
                ``json.JSONDecodeError`` is attached as ``__cause__``.

        Note:
            Errors raised by ``validate_source_path`` / ``read_file_safe``
            propagate unchanged (presumably FileNotFoundIngestionError for a
            missing file -- confirm against the base class). Unrecognized JSON
            shapes are not rejected here; they fall through to the generic
            parser, which may return a Session with no messages.
        """
        path = self.validate_source_path(source_path)
        logger.info(f"Ingesting web chat export: {path}")

        # Read first, then decode: only the decode step is expected to raise
        # JSONDecodeError, so the try block is kept to that single call.
        content = self.read_file_safe(path)
        try:
            data = json.loads(content)
        except json.JSONDecodeError as e:
            # Chain the original error so the line/column info stays reachable.
            raise ParseError(f"Invalid JSON in web chat export: {e}") from e

        # Detect format and parse
        if self._is_chatgpt_format(data):
            logger.debug("Detected ChatGPT format")
            return self._parse_chatgpt(data, path)
        if self._is_claude_web_format(data):
            logger.debug("Detected Claude.ai format")
            return self._parse_claude_web(data, path)

        logger.debug("Using generic JSON format")
        return self._parse_generic(data, path)

    def _is_chatgpt_format(self, data: Any) -> bool:
        """Detect ChatGPT export format"""
        if isinstance(data, dict):
            return 'mapping' in data or 'conversations' in data
        return False

    def _is_claude_web_format(self, data: Any) -> bool:
        """Detect Claude.ai export format"""
        if isinstance(data, dict):
            return 'chat_messages' in data or ('uuid' in data and 'name' in data)
        return False

    def _parse_chatgpt(self, data: Dict, path: Path) -> Session:
        """Parse ChatGPT export format.

        Reads the nodes of ``data['mapping']``, orders them by the message's
        ``create_time`` and turns every node with non-blank content and a
        non-``system`` author role into a Message. Code blocks, todos and
        decisions are then extracted from the concatenated message text.

        Args:
            data: Decoded export dict; only ``'mapping'`` and ``'title'`` are read.
            path: Source file path, recorded in the session metadata.

        Returns:
            A Session with ``source="web"`` and ``format="chatgpt"`` metadata.
            The message list is empty when no usable ``'mapping'`` is present.
        """
        messages: List[Message] = []
        content_chunks: List[str] = []

        def _created_at(node: Any) -> float:
            """Sort key: the node's numeric create_time, or 0 when unusable."""
            # Nodes may be non-dicts or carry "message": null; those must not
            # break the sort, so every level is type-checked before access.
            if not isinstance(node, dict):
                return 0.0
            node_msg = node.get('message')
            if not isinstance(node_msg, dict):
                return 0.0
            try:
                return float(node_msg.get('create_time'))
            except (TypeError, ValueError):
                # None, non-numeric strings, containers: sort with the "0" group.
                return 0.0

        # ChatGPT exports have a nested mapping structure
        if 'mapping' in data:
            mapping = data['mapping']
            if not isinstance(mapping, dict):
                logger.warning("ChatGPT mapping is not a dictionary")
                mapping = {}
        else:
            # NOTE(review): a dict holding only 'conversations' is detected as
            # ChatGPT format but its layout is not handled here -- confirm the
            # expected schema before adding support.
            logger.warning("ChatGPT export has no 'mapping' key; no messages extracted")
            mapping = {}

        # Sort by create_time to maintain order (sorted() is stable, so nodes
        # without a usable timestamp keep their original relative order).
        sorted_nodes = sorted(mapping.values(), key=_created_at)

        for node in sorted_nodes:
            if not isinstance(node, dict):
                continue

            msg_data = node.get('message')
            if not msg_data or not isinstance(msg_data, dict):
                continue

            content_data = msg_data.get('content')
            if not content_data:
                continue

            # Extract content
            if isinstance(content_data, dict):
                # 'parts' may be missing or null; a lone scalar is wrapped so a
                # string is not accidentally split into characters.
                parts = content_data.get('parts') or []
                if not isinstance(parts, (list, tuple)):
                    parts = [parts]
                content = '\n'.join(str(p) for p in parts if p)
            elif isinstance(content_data, str):
                content = content_data
            else:
                content = str(content_data)

            if not content.strip():
                continue

            # Get role
            author = msg_data.get('author')
            role = author.get('role', 'unknown') if isinstance(author, dict) else 'unknown'

            if role == 'system':
                continue  # Skip system messages

            # Guard against non-string roles before normalizing.
            if role is not None and not isinstance(role, str):
                role = 'unknown'
            role = self._normalize_role(role)

            # Build metadata
            metadata: Dict[str, Any] = {}
            msg_metadata = msg_data.get('metadata', {})
            if isinstance(msg_metadata, dict) and msg_metadata.get('model_slug'):
                metadata['model'] = msg_metadata['model_slug']

            # A null create_time becomes '' rather than the literal "None".
            created = msg_data.get('create_time')
            timestamp = '' if created is None else str(created)

            messages.append(Message(
                role=role,
                content=content,
                timestamp=timestamp,
                metadata=metadata
            ))
            content_chunks.append(content)

        # Each message contributes its text followed by a newline.
        all_content = "".join(f"{chunk}\n" for chunk in content_chunks)

        code_blocks = self.extract_code_blocks(all_content)
        todos = self.extract_todos(all_content)
        decisions = self.extract_decisions(all_content)

        logger.info(
            f"Ingested {len(messages)} messages, "
            f"{len(code_blocks)} code blocks from ChatGPT export"
        )

        return Session(
            source="web",
            messages=messages,
            code_blocks=code_blocks,
            files_created=[],
            decisions=decisions,
            todos=todos,
            metadata={
                "source_file": str(path),
                "format": "chatgpt",
                "title": data.get('title', ''),
            }
        )

    def _parse_claude_web(self, data: Dict, path: Path) -> Session:
        """Parse Claude.ai web export format.

        Builds one Message per dict entry of ``data['chat_messages']`` using
        its ``sender``, ``text`` and ``created_at`` fields, then extracts code
        blocks, todos and decisions from the concatenated message text.

        Args:
            data: Decoded export dict; ``'chat_messages'``, ``'uuid'`` and
                ``'name'`` are read.
            path: Source file path, recorded in the session metadata.

        Returns:
            A Session with ``source="web"`` and ``format="claude_web"`` metadata.
        """
        messages: List[Message] = []
        content_chunks: List[str] = []

        chat_messages = data.get('chat_messages', [])
        if not isinstance(chat_messages, list):
            logger.warning("chat_messages is not a list")
            chat_messages = []

        for idx, msg in enumerate(chat_messages):
            if not isinstance(msg, dict):
                logger.warning(f"Skipping non-dict message at index {idx}")
                continue

            sender = msg.get('sender', 'unknown')
            # Non-string senders (numbers, objects) cannot be normalized.
            if sender is not None and not isinstance(sender, str):
                sender = 'unknown'
            role = self._normalize_role(sender)

            content = msg.get('text', '')
            if not isinstance(content, str):
                content = str(content) if content else ''

            # Always hand Message a string timestamp: null -> '', other
            # non-string values are stringified (same as the ChatGPT parser).
            created = msg.get('created_at')
            timestamp = '' if created is None else str(created)

            messages.append(Message(
                role=role,
                content=content,
                timestamp=timestamp,
                metadata={}
            ))
            content_chunks.append(content)

        # Each message contributes its text followed by a newline.
        all_content = "".join(f"{chunk}\n" for chunk in content_chunks)

        code_blocks = self.extract_code_blocks(all_content)
        todos = self.extract_todos(all_content)
        decisions = self.extract_decisions(all_content)

        logger.info(
            f"Ingested {len(messages)} messages, "
            f"{len(code_blocks)} code blocks from Claude.ai export"
        )

        return Session(
            source="web",
            messages=messages,
            code_blocks=code_blocks,
            files_created=[],
            decisions=decisions,
            todos=todos,
            metadata={
                "source_file": str(path),
                "format": "claude_web",
                "uuid": data.get('uuid', ''),
                "name": data.get('name', ''),
            }
        )

    def _parse_generic(self, data: Any, path: Path) -> Session:
        """Parse generic JSON chat format.

        Accepts either a top-level list of message dicts, or a dict holding
        such a list under ``'messages'`` or ``'conversation'``. For each
        message the role is taken from ``role``/``sender`` and the text from
        ``content``/``text``/``message``; the first key with a usable value
        wins. Any other input yields a Session with no messages.

        Args:
            data: Decoded JSON value of any type.
            path: Source file path, recorded in the session metadata.

        Returns:
            A Session with ``source="web"`` and ``format="generic_json"`` metadata.
        """
        messages: List[Message] = []
        content_chunks: List[str] = []

        raw_messages: List[Any] = []
        if isinstance(data, list):
            raw_messages = data
        elif isinstance(data, dict):
            # Take the first key that actually holds a list, so a null or
            # malformed 'messages' does not hide a valid 'conversation'.
            for list_key in ('messages', 'conversation'):
                candidate = data.get(list_key)
                if isinstance(candidate, list):
                    raw_messages = candidate
                    break

        if not raw_messages:
            logger.warning("No messages found in generic format")

        for idx, msg in enumerate(raw_messages):
            if not isinstance(msg, dict):
                logger.warning(f"Skipping non-dict message at index {idx}")
                continue

            # Fall back on empty/null values too, not only on missing keys.
            raw_role = msg.get('role') or msg.get('sender') or 'unknown'
            if not isinstance(raw_role, str):
                raw_role = 'unknown'
            role = self._normalize_role(raw_role)

            raw_content = next(
                (msg[key] for key in ('content', 'text', 'message') if msg.get(key)),
                ''
            )
            content = raw_content if isinstance(raw_content, str) else str(raw_content)

            # Always hand Message a string timestamp: null -> ''.
            stamp = msg.get('timestamp')
            timestamp = '' if stamp is None else str(stamp)

            messages.append(Message(
                role=role,
                content=content,
                timestamp=timestamp,
                metadata={}
            ))
            content_chunks.append(content)

        # Each message contributes its text followed by a newline.
        all_content = "".join(f"{chunk}\n" for chunk in content_chunks)

        code_blocks = self.extract_code_blocks(all_content)
        todos = self.extract_todos(all_content)
        decisions = self.extract_decisions(all_content)

        logger.info(
            f"Ingested {len(messages)} messages, "
            f"{len(code_blocks)} code blocks from generic export"
        )

        return Session(
            source="web",
            messages=messages,
            code_blocks=code_blocks,
            files_created=[],
            decisions=decisions,
            todos=todos,
            metadata={
                "source_file": str(path),
                "format": "generic_json",
            }
        )

    def _normalize_role(self, role: str) -> str:
        """Normalize role names to standard format"""
        role_lower = role.lower() if role else 'unknown'

        if role_lower in ('user', 'human'):
            return 'user'
        elif role_lower in ('assistant', 'bot', 'ai', 'claude', 'chatgpt'):
            return 'assistant'
        elif role_lower == 'system':
            return 'system'
        else:
            return role_lower
