"""
UCTS Analysis Engine - Extract project structure from conversations
"""
import re
from typing import Dict, List, Set, Optional
from collections import Counter

from ucts.core.models import Session, CodeBlock, ProjectStructure


class AnalysisEngine:
    """Analyze conversations and extract project structure"""

    # Standard library modules to exclude from dependencies (Python 3.9-3.12+)
    PYTHON_STDLIB = {
        # Core
        'os', 'sys', 'json', 're', 'datetime', 'time', 'math', 'random',
        'collections', 'itertools', 'functools', 'typing', 'pathlib',
        'subprocess', 'threading', 'multiprocessing', 'logging', 'unittest',
        'dataclasses', 'abc', 'contextlib', 'copy', 'io', 'tempfile',
        'shutil', 'glob', 'fnmatch', 'pickle', 'sqlite3', 'csv', 'hashlib',
        'base64', 'uuid', 'argparse', 'configparser', 'enum', 'warnings',
        # Additional commonly used
        'inspect', 'traceback', 'types', 'weakref', 'operator', 'heapq',
        'bisect', 'array', 'decimal', 'fractions', 'statistics', 'struct',
        'codecs', 'string', 'textwrap', 'difflib', 'html', 'xml', 'urllib',
        'http', 'email', 'socket', 'ssl', 'select', 'signal', 'platform',
        'ctypes', 'importlib', 'pkgutil', 'zipfile', 'tarfile', 'zlib',
        'gzip', 'bz2', 'lzma', 'pprint', 'secrets', 'getpass', 'curses',
        'asyncio', 'concurrent', 'queue', 'sched', 'contextvars',
        # Python 3.11+
        'tomllib',
        # Legacy web/CGI helpers (cgi and cgitb were deprecated in 3.11 and removed in 3.13)
        'wsgiref', 'cgi', 'cgitb',
    }

    # Node.js built-in modules (Node 18+)
    NODE_BUILTIN = {
        'fs', 'path', 'http', 'https', 'crypto', 'os', 'util', 'events',
        'stream', 'buffer', 'url', 'querystring', 'child_process', 'cluster',
        'net', 'dns', 'readline', 'assert', 'console', 'process', 'tty',
        'vm', 'v8', 'zlib', 'worker_threads', 'perf_hooks', 'async_hooks',
        'inspector', 'trace_events', 'repl', 'module', 'timers', 'string_decoder',
        'punycode', 'domain', 'constants', 'tls', 'dgram', 'http2', 'wasi',
        'diagnostics_channel', 'test',
    }

    # Go standard library packages (common ones)
    GO_STDLIB = {
        'fmt', 'io', 'os', 'net', 'http', 'json', 'encoding', 'strings',
        'strconv', 'bytes', 'bufio', 'sort', 'sync', 'time', 'context',
        'errors', 'log', 'path', 'filepath', 'regexp', 'math', 'crypto',
        'testing', 'reflect', 'runtime', 'unsafe', 'syscall', 'flag',
        'database', 'sql', 'html', 'template', 'text', 'archive', 'compress',
        'container', 'debug', 'embed', 'go', 'hash', 'image', 'index', 'mime',
        'plugin', 'unicode', 'expvar', 'net/http', 'io/ioutil', 'io/fs',
    }

    # Java standard library packages (to exclude)
    JAVA_STDLIB = {
        'java', 'javax', 'sun', 'com.sun', 'jdk', 'org.w3c', 'org.xml',
        'org.omg', 'org.ietf',
    }

    # Language aliases for normalization
    LANGUAGE_ALIASES = {
        'py': 'python',
        'python3': 'python',
        'python2': 'python',
        'js': 'javascript',
        'node': 'javascript',
        'nodejs': 'javascript',
        'ts': 'typescript',
        'tsx': 'typescript',
        'jsx': 'javascript',
        'rs': 'rust',
        'rb': 'ruby',
        'cs': 'csharp',
        'c#': 'csharp',
        'c++': 'cpp',
        'cc': 'cpp',
        'cxx': 'cpp',
        'h': 'c',
        'hpp': 'cpp',
        'hxx': 'cpp',
        'kt': 'kotlin',
        'kts': 'kotlin',
        'swift': 'swift',
        'sh': 'bash',
        'zsh': 'bash',
        'fish': 'bash',
        'ps1': 'powershell',
        'psm1': 'powershell',
        'yml': 'yaml',
        'md': 'markdown',
        'dockerfile': 'docker',
        'makefile': 'make',
        'mk': 'make',
    }

    def analyze(self, session: Session) -> ProjectStructure:
        """Extract project structure from conversation"""
        code_blocks = self._extract_all_code_blocks(session)
        languages = self._detect_languages(code_blocks)
        files = self._infer_files(code_blocks, session)
        dependencies = self._extract_dependencies(code_blocks, languages)
        readme = self._generate_readme(session, languages, code_blocks)

        return ProjectStructure(
            name=self._infer_project_name(session),
            description=self._summarize_session(session),
            languages=languages,
            dependencies=dependencies,
            files=files,
            directories=self._infer_directories(files),
            readme_content=readme,
            todos=session.todos,
            metadata={
                'source': session.source,
                'message_count': len(session.messages),
                'code_block_count': len(code_blocks),
            }
        )

    def _extract_all_code_blocks(self, session: Session) -> List[CodeBlock]:
        """Extract code blocks from all messages"""
        blocks = []

        for msg in session.messages:
            if msg.role == "assistant":
                # Find ```language ... ``` blocks
                pattern = r'```(\w+)?\n(.*?)```'
                matches = re.findall(pattern, msg.content, re.DOTALL)

                for lang, content in matches:
                    lang = lang.strip() if lang else 'text'
                    blocks.append(CodeBlock(
                        language=lang,
                        content=content.strip(),
                        filename=self._infer_filename(content, lang),
                        purpose=self._classify_purpose(content, lang)
                    ))

        # Also include pre-extracted blocks
        blocks.extend(session.code_blocks)

        return blocks

    def _normalize_language(self, lang: str) -> str:
        """Normalize language name to canonical form"""
        if not lang:
            return 'text'
        lang_lower = lang.lower().strip()
        return self.LANGUAGE_ALIASES.get(lang_lower, lang_lower)

    def _detect_languages(self, blocks: List[CodeBlock]) -> List[str]:
        """Detect programming languages used"""
        skip_langs = {'text', 'plaintext', 'output', 'console', 'log', 'diff'}
        lang_counts = Counter(
            self._normalize_language(b.language) for b in blocks
            if b.language and self._normalize_language(b.language) not in skip_langs
        )
        return [lang for lang, _ in lang_counts.most_common()]

    def _infer_filename(self, code: str, language: str) -> Optional[str]:
        """Infer filename from code content"""
        # Normalize the language first
        language = self._normalize_language(language)

        extensions = {
            # Scripting languages
            'python': '.py',
            'javascript': '.js',
            'typescript': '.ts',
            'ruby': '.rb',
            'php': '.php',
            'perl': '.pl',
            'lua': '.lua',
            'r': '.R',
            # Systems languages
            'rust': '.rs',
            'go': '.go',
            'c': '.c',
            'cpp': '.cpp',
            'csharp': '.cs',
            # JVM languages
            'java': '.java',
            'kotlin': '.kt',
            'scala': '.scala',
            'groovy': '.groovy',
            'clojure': '.clj',
            # Mobile
            'swift': '.swift',
            'objective-c': '.m',
            'dart': '.dart',
            # Web
            'html': '.html',
            'css': '.css',
            'scss': '.scss',
            'sass': '.sass',
            'less': '.less',
            'vue': '.vue',
            'svelte': '.svelte',
            # Config/Data
            'json': '.json',
            'yaml': '.yaml',
            'toml': '.toml',
            'xml': '.xml',
            'ini': '.ini',
            'env': '.env',
            # Database
            'sql': '.sql',
            'graphql': '.graphql',
            'prisma': '.prisma',
            # Shell
            'bash': '.sh',
            'powershell': '.ps1',
            'batch': '.bat',
            'fish': '.fish',
            # Build/Config files
            'docker': 'Dockerfile',
            'make': 'Makefile',
            'cmake': 'CMakeLists.txt',
            'gradle': '.gradle',
            # Documentation
            'markdown': '.md',
            'rst': '.rst',
            'tex': '.tex',
            'latex': '.tex',
            # Other
            'elixir': '.ex',
            'erlang': '.erl',
            'haskell': '.hs',
            'ocaml': '.ml',
            'fsharp': '.fs',
            'zig': '.zig',
            'nim': '.nim',
            'v': '.v',
            'solidity': '.sol',
            'proto': '.proto',
            'protobuf': '.proto',
        }

        # Look for filename hints in code
        patterns = [
            r'#\s*file:\s*(\S+)',          # # file: name.py
            r'//\s*file:\s*(\S+)',         # // file: name.ts
            r'#\s*(\w+\.py)',              # Python files
            r'//\s*(\w+\.[jt]sx?)',        # JS/TS files
            r'<!--\s*(\S+\.html)',         # HTML files
            r'class\s+(\w+)',              # Class name -> filename
            r'def\s+(\w+)\s*\(',           # Function name (for single-function files)
        ]

        for pattern in patterns:
            match = re.search(pattern, code[:500])  # Only check first 500 chars
            if match:
                found = match.group(1)
                if '.' in found:
                    return found
                elif language in extensions:
                    return f"{found.lower()}{extensions[language]}"

        return None

    def _classify_purpose(self, code: str, language: str) -> str:
        """Classify the purpose of a code block"""
        code_lower = code.lower()

        if any(kw in code_lower for kw in ['test', 'assert', 'expect', 'describe', 'it(']):
            return 'test'
        elif language in ('json', 'yaml', 'yml', 'toml', 'ini', 'env'):
            return 'config'
        elif 'example' in code_lower or 'demo' in code_lower:
            return 'example'
        elif language in ('bash', 'shell', 'sh'):
            return 'script'
        elif language in ('dockerfile', 'makefile'):
            return 'build'
        else:
            return 'implementation'

    def _infer_files(self, blocks: List[CodeBlock], session: Session) -> Dict[str, str]:
        """Infer file structure from code blocks"""
        files: Dict[str, str] = {}
        unnamed_counts: Dict[str, int] = {}

        for block in blocks:
            if block.purpose == 'example':
                continue  # Skip examples

            filename = block.filename
            if not filename:
                # Generate a name
                ext = self._get_extension(block.language)
                count = unnamed_counts.get(block.language, 0)
                unnamed_counts[block.language] = count + 1

                if count == 0:
                    filename = f"main{ext}"
                else:
                    filename = f"module_{count}{ext}"

            # Add to appropriate directory
            if block.purpose == 'test':
                filename = f"tests/{filename}"
            elif block.purpose == 'config':
                pass  # Keep at root
            elif block.language == 'python':
                if not filename.startswith('src/'):
                    filename = f"src/{filename}"

            # Avoid duplicates
            if filename in files:
                base, ext = filename.rsplit('.', 1) if '.' in filename else (filename, '')
                counter = 1
                while f"{base}_{counter}.{ext}" in files:
                    counter += 1
                filename = f"{base}_{counter}.{ext}"

            files[filename] = block.content

        return files

    def _get_extension(self, language: str) -> str:
        """Get file extension for language"""
        # Normalize the language first
        language = self._normalize_language(language)

        extensions = {
            # Scripting languages
            'python': '.py',
            'javascript': '.js',
            'typescript': '.ts',
            'ruby': '.rb',
            'php': '.php',
            'perl': '.pl',
            'lua': '.lua',
            'r': '.R',
            # Systems languages
            'rust': '.rs',
            'go': '.go',
            'c': '.c',
            'cpp': '.cpp',
            'csharp': '.cs',
            # JVM languages
            'java': '.java',
            'kotlin': '.kt',
            'scala': '.scala',
            'groovy': '.groovy',
            # Mobile
            'swift': '.swift',
            'dart': '.dart',
            # Web
            'html': '.html',
            'css': '.css',
            'scss': '.scss',
            'vue': '.vue',
            'svelte': '.svelte',
            # Config
            'json': '.json',
            'yaml': '.yaml',
            'toml': '.toml',
            'xml': '.xml',
            # Database
            'sql': '.sql',
            'graphql': '.graphql',
            # Shell
            'bash': '.sh',
            'powershell': '.ps1',
            # Build
            'docker': 'Dockerfile',
            'make': 'Makefile',
            # Documentation
            'markdown': '.md',
            # Other
            'elixir': '.ex',
            'haskell': '.hs',
            'zig': '.zig',
            'solidity': '.sol',
        }
        return extensions.get(language, '.txt')

    def _extract_dependencies(self, blocks: List[CodeBlock],
                             languages: List[str]) -> Dict[str, List[str]]:
        """Extract dependencies from code blocks"""
        deps: Dict[str, Set[str]] = {}

        for block in blocks:
            lang = self._normalize_language(block.language)

            if lang == 'python':
                self._extract_python_deps(block.content, deps)
            elif lang in ('javascript', 'typescript'):
                self._extract_node_deps(block.content, deps)
            elif lang == 'go':
                self._extract_go_deps(block.content, deps)
            elif lang == 'java':
                self._extract_java_deps(block.content, deps)
            elif lang == 'rust':
                self._extract_rust_deps(block.content, deps)
            elif lang == 'ruby':
                self._extract_ruby_deps(block.content, deps)
            elif lang == 'php':
                self._extract_php_deps(block.content, deps)
            elif lang == 'csharp':
                self._extract_csharp_deps(block.content, deps)
            elif lang == 'swift':
                self._extract_swift_deps(block.content, deps)
            elif lang == 'kotlin':
                self._extract_kotlin_deps(block.content, deps)

        return {k: sorted(list(v)) for k, v in deps.items()}

    def _extract_python_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Python dependencies"""
        imports = set()

        # import module
        for match in re.findall(r'^import\s+(\w+)', code, re.MULTILINE):
            imports.add(match)

        # from module import ...
        for match in re.findall(r'^from\s+(\w+)', code, re.MULTILINE):
            imports.add(match)

        # Filter out stdlib
        third_party = imports - self.PYTHON_STDLIB

        if third_party:
            deps.setdefault('python', set()).update(third_party)

    def _extract_node_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Node.js dependencies"""
        imports = set()

        # import ... from 'module'
        for match in re.findall(r'from\s+[\'"]([^\'"/]+)', code):
            imports.add(match)

        # require('module')
        for match in re.findall(r'require\s*\(\s*[\'"]([^\'"/]+)', code):
            imports.add(match)

        # Filter out built-ins and relative imports
        third_party = {i for i in imports if not i.startswith('.') and i not in self.NODE_BUILTIN}

        if third_party:
            deps.setdefault('node', set()).update(third_party)

    def _extract_go_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Go dependencies"""
        imports = set()

        # Single import: import "package"
        for match in re.findall(r'import\s+["`]([^"`]+)["`]', code):
            imports.add(match)

        # Multi-line import block: import ( "package1" "package2" )
        import_block = re.search(r'import\s*\((.*?)\)', code, re.DOTALL)
        if import_block:
            for match in re.findall(r'["`]([^"`]+)["`]', import_block.group(1)):
                imports.add(match)

        # Filter out standard library (check if starts with stdlib package)
        third_party = set()
        for imp in imports:
            # Standard library packages don't have dots
            # Third-party packages typically have domain-like paths
            base_pkg = imp.split('/')[0]
            if '.' in base_pkg or base_pkg not in self.GO_STDLIB:
                third_party.add(imp)

        if third_party:
            deps.setdefault('go', set()).update(third_party)

    def _extract_java_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Java dependencies"""
        imports = set()

        # import package.Class;
        for match in re.findall(r'import\s+(?:static\s+)?([a-zA-Z_][\w.]*);', code):
            imports.add(match)

        # Filter out standard library
        third_party = set()
        for imp in imports:
            # Check if it starts with a standard package prefix
            base_pkg = imp.split('.')[0]
            is_stdlib = any(imp.startswith(prefix) for prefix in self.JAVA_STDLIB)
            if not is_stdlib:
                # Extract the group/artifact (first two parts typically)
                parts = imp.split('.')
                if len(parts) >= 2:
                    third_party.add(f"{parts[0]}.{parts[1]}")

        if third_party:
            deps.setdefault('java', set()).update(third_party)

    def _extract_rust_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Rust dependencies"""
        imports = set()

        # use crate_name::...
        for match in re.findall(r'use\s+(\w+)::', code):
            imports.add(match)

        # extern crate crate_name;
        for match in re.findall(r'extern\s+crate\s+(\w+)', code):
            imports.add(match)

        # Filter out standard library crates
        rust_stdlib = {'std', 'core', 'alloc', 'proc_macro', 'test', 'self', 'super', 'crate'}
        third_party = imports - rust_stdlib

        if third_party:
            deps.setdefault('rust', set()).update(third_party)

    def _extract_ruby_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Ruby dependencies"""
        imports = set()

        # require 'gem_name'
        for match in re.findall(r'require\s+[\'"]([^\'"]+)[\'"]', code):
            imports.add(match)

        # require_relative is for local files, skip those
        # gem 'gem_name' in Gemfile
        for match in re.findall(r'gem\s+[\'"]([^\'"]+)[\'"]', code):
            imports.add(match)

        # Filter out standard library (common ones)
        ruby_stdlib = {
            'json', 'yaml', 'csv', 'net/http', 'uri', 'fileutils', 'pathname',
            'time', 'date', 'set', 'ostruct', 'optparse', 'logger', 'erb',
            'digest', 'base64', 'securerandom', 'socket', 'stringio', 'tempfile',
            'benchmark', 'pp', 'open-uri', 'open3', 'timeout', 'thread',
        }
        third_party = imports - ruby_stdlib

        if third_party:
            deps.setdefault('ruby', set()).update(third_party)

    def _extract_php_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract PHP dependencies (Composer packages)"""
        imports = set()

        # use Namespace\Class;
        for match in re.findall(r'use\s+([A-Z][a-zA-Z0-9_\\]+)', code):
            # Extract top-level namespace (usually vendor/package)
            parts = match.split('\\')
            if len(parts) >= 2:
                imports.add(f"{parts[0].lower()}/{parts[1].lower()}")

        # require/include statements with paths aren't dependencies

        if imports:
            deps.setdefault('php', set()).update(imports)

    def _extract_csharp_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract C#/.NET dependencies (NuGet packages)"""
        imports = set()

        # using Namespace;
        for match in re.findall(r'using\s+([A-Z][a-zA-Z0-9_.]+)\s*;', code):
            imports.add(match)

        # Filter out standard .NET namespaces
        dotnet_stdlib = {
            'System', 'Microsoft', 'Windows',
        }
        third_party = set()
        for imp in imports:
            base = imp.split('.')[0]
            if base not in dotnet_stdlib:
                third_party.add(imp)

        if third_party:
            deps.setdefault('csharp', set()).update(third_party)

    def _extract_swift_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Swift dependencies (Swift Package Manager)"""
        imports = set()

        # import Module
        for match in re.findall(r'import\s+(\w+)', code):
            imports.add(match)

        # Filter out standard Swift/Apple frameworks
        swift_stdlib = {
            'Foundation', 'UIKit', 'SwiftUI', 'Combine', 'CoreData', 'CoreGraphics',
            'CoreLocation', 'MapKit', 'AVFoundation', 'WebKit', 'Security',
            'SystemConfiguration', 'CoreMotion', 'CoreBluetooth', 'HealthKit',
            'StoreKit', 'GameKit', 'SpriteKit', 'SceneKit', 'ARKit', 'RealityKit',
            'Metal', 'MetalKit', 'Accelerate', 'CoreML', 'Vision', 'NaturalLanguage',
            'Darwin', 'Dispatch', 'os', 'Swift', 'XCTest',
        }
        third_party = imports - swift_stdlib

        if third_party:
            deps.setdefault('swift', set()).update(third_party)

    def _extract_kotlin_deps(self, code: str, deps: Dict[str, Set[str]]):
        """Extract Kotlin dependencies"""
        imports = set()

        # import package.Class
        for match in re.findall(r'import\s+([a-zA-Z_][\w.]*)', code):
            imports.add(match)

        # Filter out standard Kotlin/Java packages
        kotlin_stdlib = {'kotlin', 'java', 'javax', 'android', 'androidx', 'kotlinx'}
        third_party = set()
        for imp in imports:
            base = imp.split('.')[0]
            if base not in kotlin_stdlib:
                # Extract the group (first two parts typically)
                parts = imp.split('.')
                if len(parts) >= 2:
                    third_party.add(f"{parts[0]}.{parts[1]}")

        if third_party:
            deps.setdefault('kotlin', set()).update(third_party)

    def _infer_directories(self, files: Dict[str, str]) -> List[str]:
        """Infer directory structure from files"""
        directories = set()

        for path in files.keys():
            parts = path.split('/')
            for i in range(1, len(parts)):
                directories.add('/'.join(parts[:i]))

        return sorted(list(directories))

    def _infer_project_name(self, session: Session) -> str:
        """Infer project name from session"""
        # Try to find project name in first few messages
        for msg in session.messages[:5]:
            content = msg.content.lower()

            # Look for explicit project mentions
            patterns = [
                r'project[:\s]+["\']?(\w+)',
                r'called\s+["\']?(\w+)',
                r'named\s+["\']?(\w+)',
                r'building\s+(?:a\s+)?(\w+)',
            ]

            for pattern in patterns:
                match = re.search(pattern, content)
                if match:
                    return match.group(1).replace(' ', '-').lower()

        # Default based on source
        return f"ucts-{session.source}-project"

    def _summarize_session(self, session: Session) -> str:
        """Generate a brief session summary"""
        if not session.messages:
            return "Project generated from AI conversation"

        # Get first user message for context
        for msg in session.messages:
            if msg.role == 'user' and len(msg.content) > 20:
                summary = msg.content[:200]
                if len(msg.content) > 200:
                    summary += "..."
                return summary

        return "Project generated from AI conversation"

    def _generate_readme(self, session: Session, languages: List[str],
                         blocks: List[CodeBlock]) -> str:
        """Generate README.md content"""
        name = self._infer_project_name(session)
        description = self._summarize_session(session)

        readme = f"""# {name}

{description}

## Overview

This project was generated from an AI conversation using UCTS (Universal Context Transfer System).

"""

        if languages:
            readme += f"## Languages\n\n"
            for lang in languages:
                readme += f"- {lang.title()}\n"
            readme += "\n"

        if session.todos:
            readme += "## TODOs\n\n"
            for todo in session.todos[:10]:
                readme += f"- [ ] {todo}\n"
            readme += "\n"

        readme += """## Getting Started

```bash
# Clone the repository
git clone <repo-url>
cd """ + name + """

# Install dependencies (if applicable)
pip install -r requirements.txt  # Python
npm install                       # Node.js
```

## Generated by UCTS

This project was automatically generated from an AI conversation.
For more information, visit the [UCTS documentation](https://gitlab.com/sentinel-protocol/ucts).
"""

        return readme
