"""Base ingester class for session capture"""
from abc import ABC, abstractmethod
import logging
import re
from pathlib import Path
from typing import List, Optional

from ucts.core.models import Session, CodeBlock

# Module-level logger named after this module; nothing is configured here.
logger = logging.getLogger(__name__)


class IngestionError(Exception):
    """Root of the ingestion exception hierarchy.

    Catching this type also catches the more specific ingestion errors
    that derive from it.
    """


class FileNotFoundIngestionError(IngestionError):
    """Raised when the source to ingest does not exist or is not a file."""


class InvalidFormatError(IngestionError):
    """Raised when the source file's format is invalid or unsupported."""


class ParseError(IngestionError):
    """Raised when a source file's contents cannot be read or parsed."""


class SessionIngester(ABC):
    """Abstract base class for session ingesters"""

    @abstractmethod
    def ingest(self, source_path: str) -> Session:
        """
        Ingest a conversation from the given source.

        Args:
            source_path: Path to the source file

        Returns:
            Session object containing parsed conversation

        Raises:
            FileNotFoundIngestionError: If source file doesn't exist
            InvalidFormatError: If file format is not supported
            ParseError: If file contents cannot be parsed
        """
        pass

    def validate_source_path(self, source_path: str) -> Path:
        """
        Validate and return a Path object for the source.

        Args:
            source_path: Path to the source file

        Returns:
            Path object

        Raises:
            ValueError: If source_path is empty or None
            FileNotFoundIngestionError: If file doesn't exist
        """
        if not source_path:
            raise ValueError("source_path cannot be empty or None")

        path = Path(source_path)

        if not path.exists():
            raise FileNotFoundIngestionError(f"Source file not found: {source_path}")

        if not path.is_file():
            raise FileNotFoundIngestionError(f"Source path is not a file: {source_path}")

        return path

    def read_file_safe(self, path: Path, encoding: str = 'utf-8') -> str:
        """
        Safely read file contents with error handling.

        Args:
            path: Path to file
            encoding: File encoding (default: utf-8)

        Returns:
            File contents as string

        Raises:
            ParseError: If file cannot be read
        """
        try:
            return path.read_text(encoding=encoding)
        except UnicodeDecodeError:
            # Try with different encodings
            for alt_encoding in ['utf-8-sig', 'latin-1', 'cp1252']:
                try:
                    logger.debug(f"Retrying with encoding: {alt_encoding}")
                    return path.read_text(encoding=alt_encoding)
                except UnicodeDecodeError:
                    continue
            raise ParseError(f"Could not decode file with any supported encoding: {path}")
        except PermissionError as e:
            raise ParseError(f"Permission denied reading file: {e}")
        except OSError as e:
            raise ParseError(f"Error reading file: {e}")

    def extract_code_blocks(self, content: str) -> List[CodeBlock]:
        """Extract code blocks from markdown-formatted content"""
        if not content:
            return []

        blocks = []
        # Match code blocks with or without language specification
        pattern = r'```(\w+)?\n(.*?)```'
        matches = re.findall(pattern, content, re.DOTALL)

        for lang, code in matches:
            language = lang.strip().lower() if lang else 'text'
            code_stripped = code.strip()

            if not code_stripped:
                continue  # Skip empty code blocks

            blocks.append(CodeBlock(
                language=language,
                content=code_stripped,
                filename=self._infer_filename(code_stripped, language),
                purpose=self._classify_purpose(code_stripped, language)
            ))

        logger.debug(f"Extracted {len(blocks)} code blocks")
        return blocks

    def _infer_filename(self, code: str, language: str) -> str:
        """Infer filename from code content"""
        extensions = {
            'python': '.py',
            'py': '.py',
            'javascript': '.js',
            'js': '.js',
            'typescript': '.ts',
            'ts': '.ts',
            'tsx': '.tsx',
            'jsx': '.jsx',
            'rust': '.rs',
            'rs': '.rs',
            'go': '.go',
            'golang': '.go',
            'java': '.java',
            'c': '.c',
            'cpp': '.cpp',
            'c++': '.cpp',
            'csharp': '.cs',
            'cs': '.cs',
            'ruby': '.rb',
            'rb': '.rb',
            'php': '.php',
            'html': '.html',
            'css': '.css',
            'scss': '.scss',
            'sass': '.sass',
            'json': '.json',
            'yaml': '.yaml',
            'yml': '.yaml',
            'toml': '.toml',
            'sql': '.sql',
            'bash': '.sh',
            'shell': '.sh',
            'sh': '.sh',
            'powershell': '.ps1',
            'ps1': '.ps1',
            'dockerfile': 'Dockerfile',
            'makefile': 'Makefile',
            'markdown': '.md',
            'md': '.md',
        }

        # Try to find filename in comments
        patterns = [
            r'#\s*(?:file[:\s]+)?(\S+\.py)',  # Python
            r'//\s*(?:file[:\s]+)?(\S+\.[jt]sx?)',  # JS/TS
            r'<!--\s*(?:file[:\s]+)?(\S+\.html)',  # HTML
            r'/\*\s*(?:file[:\s]+)?(\S+\.\w+)',  # C-style
            r'#\s*(\S+\.sh)',  # Shell
        ]

        for pattern in patterns:
            match = re.search(pattern, code[:500])  # Only check first 500 chars
            if match:
                filename = match.group(1)
                # Basic validation
                if len(filename) < 100 and '/' not in filename and '\\' not in filename:
                    return filename

        # Generate default name
        ext = extensions.get(language.lower(), '.txt')
        return f"code{ext}"

    def _classify_purpose(self, code: str, language: str) -> str:
        """Classify the purpose of a code block"""
        if not code:
            return 'unknown'

        code_lower = code.lower()
        first_line = code_lower.split('\n')[0] if code_lower else ''

        # Check for test patterns
        test_patterns = ['test', 'assert', 'expect(', 'describe(', 'it(', 'pytest', 'unittest']
        if any(p in code_lower for p in test_patterns):
            return 'test'

        # Check for config patterns
        if language.lower() in ('json', 'yaml', 'yml', 'toml', 'ini', 'xml'):
            return 'config'

        # Check for example/demo patterns
        if 'example' in code_lower or 'demo' in code_lower or '# usage' in code_lower:
            return 'example'

        # Check for script patterns
        if first_line.startswith('#!') or 'if __name__' in code_lower:
            return 'script'

        return 'implementation'

    def extract_todos(self, content: str) -> List[str]:
        """Extract TODO items from content"""
        if not content:
            return []

        todos = []
        patterns = [
            r'TODO[:\s]+(.+?)(?:\n|$)',
            r'FIXME[:\s]+(.+?)(?:\n|$)',
            r'XXX[:\s]+(.+?)(?:\n|$)',
            r'HACK[:\s]+(.+?)(?:\n|$)',
            r'\[\s*\]\s+(.+?)(?:\n|$)',  # Markdown checkbox
            r'-\s*\[\s*\]\s+(.+?)(?:\n|$)',  # List markdown checkbox
        ]

        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                cleaned = match.strip()
                if cleaned and len(cleaned) < 500:  # Reasonable length limit
                    todos.append(cleaned)

        # Remove duplicates while preserving order
        seen = set()
        unique_todos = []
        for todo in todos:
            if todo.lower() not in seen:
                seen.add(todo.lower())
                unique_todos.append(todo)

        logger.debug(f"Extracted {len(unique_todos)} TODOs")
        return unique_todos

    def extract_decisions(self, content: str) -> List[str]:
        """Extract decisions from content"""
        if not content:
            return []

        decisions = []
        patterns = [
            r'(?:we\s+)?decided\s+to\s+(.+?)(?:\.|$)',
            r'we\s+will\s+(.+?)(?:\.|$)',
            r'(?:we\'re\s+|we\s+are\s+)?going\s+to\s+(.+?)(?:\.|$)',
            r'decision[:\s]+(.+?)(?:\n|$)',
            r'chose\s+to\s+(.+?)(?:\.|$)',
            r'selected\s+(.+?)(?:\s+(?:for|as|because)|$)',
        ]

        for pattern in patterns:
            matches = re.findall(pattern, content, re.IGNORECASE)
            for match in matches:
                cleaned = match.strip()
                if cleaned and len(cleaned) < 500:  # Reasonable length limit
                    decisions.append(cleaned)

        # Remove duplicates
        seen = set()
        unique_decisions = []
        for decision in decisions:
            if decision.lower() not in seen:
                seen.add(decision.lower())
                unique_decisions.append(decision)

        logger.debug(f"Extracted {len(unique_decisions)} decisions")
        return unique_decisions
