import os
import json
import fitz  # PyMuPDF
import re
import logging
from typing import List, Dict, Any

# Configure the root logger to emit INFO and above, then create the
# module-level logger used by every function in this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the text of every page of a PDF file.

    Args:
        pdf_path: Path of the PDF document to read.

    Returns:
        The text of all pages concatenated in iteration order, or an empty
        string if the file could not be opened or read (the failure is
        logged rather than raised).
    """
    try:
        logger.info(f"Opening PDF: {pdf_path}")
        doc = fitz.open(pdf_path)
        try:
            # Join once instead of repeated `+=` so large handbooks do not
            # pay for re-copying the accumulated string on every page.
            text = "".join(page.get_text() for page in doc)
        finally:
            # Always release the document, even if a page fails to parse.
            doc.close()
        logger.info(f"Successfully extracted text from {pdf_path}")
        return text
    except Exception as e:
        # Best-effort by design: callers treat "" as "nothing extracted".
        logger.error(f"Error extracting text from {pdf_path}: {str(e)}")
        return ""

def create_program_qa_pairs(text: str, program_code: str, program_name: str, program_type: str = "undergraduate") -> List[Dict[str, Any]]:
    """Build the templated Q&A entries for one academic program.

    Six entries are produced, in this order: program overview, admission
    requirements, course structure, duration/mode, assessment methods and
    career prospects. Each entry is a dict with the keys ``intent``,
    ``questions`` and ``answer`` (in that order).

    Args:
        text: Extracted handbook text. Accepted for interface compatibility;
            the templates below do not read it.
        program_code: Short program code; lower-cased to prefix each intent.
        program_name: Full program name interpolated into the templates.
        program_type: ``"undergraduate"`` selects the undergraduate wording;
            any other value selects the alternative (postgraduate) wording.

    Returns:
        A list of six Q&A dictionaries.
    """
    slug = program_code.lower()
    is_undergrad = program_type == "undergraduate"

    def entry(topic: str, questions: List[str], answer: str) -> Dict[str, Any]:
        # Key order matters for the serialized JSON layout.
        return {"intent": f"{slug}_{topic}", "questions": questions, "answer": answer}

    # Admission wording depends on the program level
    if is_undergrad:
        admission_lines = [
            f"To be admitted into the {program_name} program, you need:",
            "",
            "1. UTME Entry:",
            "- Five O'Level credits including English and Mathematics",
            "- Acceptable UTME score in relevant subjects",
            "",
            "2. Direct Entry:",
            "- Relevant ND/HND/NCE qualification",
            "- Meet other DE requirements",
        ]
    else:
        admission_lines = [
            f"To be admitted into the {program_name} program, you need:",
            "",
            "1. Entry Requirements:",
            "- Bachelor's degree in relevant field",
            "- Minimum of Second Class Lower",
            "- Professional qualifications may be considered",
            "",
            "2. Additional Requirements:",
            "- Work experience may be required",
            "- Letters of recommendation",
            "- Statement of purpose",
        ]

    # Duration wording depends on the program level
    years = "4" if is_undergrad else "2"
    if is_undergrad:
        primary_duration = "UTME Entry: " + years + " years"
        secondary_duration = "Direct Entry: Shorter duration based on entry level"
    else:
        primary_duration = "Full-time: " + years + " years"
        secondary_duration = "Part-time: Extended duration available"

    return [
        # Program Overview
        entry(
            "program_overview",
            [
                f"What is the {program_name} program about?",
                f"Tell me about {program_code} at ABU DLC",
                f"What can I expect from the {program_name} program?",
                f"Describe the {program_name} program",
                f"Give me an overview of {program_code}",
            ],
            f"The {program_name} ({program_code}) program at ABU DLC is designed to provide students with comprehensive knowledge and skills in {program_name}. The program is fully accredited by the National Universities Commission (NUC).",
        ),
        # Admission Requirements
        entry(
            "admission_requirements",
            [
                f"What are the admission requirements for {program_code}?",
                f"How can I get admission into {program_code} program?",
                f"What do I need to study {program_code} at ABU DLC?",
                f"Tell me the entry requirements for {program_name}",
                f"Qualifications needed for {program_code} program",
            ],
            "\n".join(admission_lines),
        ),
        # Course Structure
        entry(
            "course_structure",
            [
                f"What courses will I take in {program_code}?",
                f"Show me the {program_code} course structure",
                f"What is the curriculum for {program_name}?",
                f"List the courses in {program_code} program",
                f"What subjects are taught in {program_code}?",
            ],
            "\n".join([
                f"The {program_name} program consists of:",
                "",
                "1. Core Courses:",
                "- Program-specific required courses",
                "- Professional courses",
                "",
                "2. General Studies:",
                "- Communication skills",
                "- ICT courses",
                "- Entrepreneurship",
                "",
                "3. Elective Courses:",
                "- Specialized options",
                "- Career-focused courses",
                "",
                "4. Research Component:",
                "- Research methodology",
                "- Final year project/dissertation",
            ]),
        ),
        # Duration and Mode
        entry(
            "duration_mode",
            [
                f"How long is the {program_code} program?",
                f"What is the duration of {program_name}?",
                f"How many years for {program_code}?",
                f"Tell me about {program_code} program duration",
                f"How is {program_code} program delivered?",
            ],
            "\n".join([
                f"The {program_name} program:",
                "",
                "1. Duration:",
                f"- {primary_duration}",
                f"- {secondary_duration}",
                "",
                "2. Mode of Study:",
                "- Online learning platform",
                "- Virtual classrooms",
                "- Interactive sessions",
                "- Periodic face-to-face meetings",
            ]),
        ),
        # Assessment Methods
        entry(
            "assessment_methods",
            [
                f"How are {program_code} students assessed?",
                f"What are the evaluation methods in {program_code}?",
                f"How do you pass {program_code} courses?",
                f"Explain {program_code} examination process",
                f"What are the grading criteria for {program_code}?",
            ],
            "\n".join([
                f"Assessment in the {program_name} program includes:",
                "",
                "1. Continuous Assessment (40%):",
                "- Assignments",
                "- Mid-semester tests",
                "- Presentations",
                "- Online quizzes",
                "",
                "2. Final Examination (60%):",
                "- End of semester exams",
                "- Comprehensive assessment",
                "",
                "3. Other Components:",
                "- Project work",
                "- Practical assessments",
                "- Field reports",
            ]),
        ),
        # Career Prospects
        entry(
            "career_prospects",
            [
                f"What can I do with a {program_code} degree?",
                f"Career opportunities after {program_code}",
                f"Job prospects for {program_code} graduates",
                f"Where can I work after {program_code}?",
                f"Employment options with {program_code}",
            ],
            "\n".join([
                f"Graduates of the {program_name} program have various career opportunities:",
                "",
                "1. Professional Roles:",
                "- Industry-specific positions",
                "- Management roles",
                "- Consultancy services",
                "",
                "2. Entrepreneurship:",
                "- Private practice",
                "- Business ventures",
                "- Consulting firms",
                "",
                "3. Further Studies:",
                "- Advanced degrees",
                "- Professional certifications",
                "- Specialization programs",
            ]),
        ),
    ]

# Maps a filename fragment to (program code, full program name, program type).
# Built once at import time instead of on every call.
_PROGRAM_INFO = {
    # Undergraduate Programs
    "BSc Accounting": ("BSC-ACC", "Bachelor of Science in Accounting", "undergraduate"),
    "BSc Business Administration": ("BSC-BUS", "Bachelor of Science in Business Administration", "undergraduate"),
    "BSc Computer Science": ("BSC-CSC", "Bachelor of Science in Computer Science", "undergraduate"),
    "BSc Economics": ("BSC-ECO", "Bachelor of Science in Economics", "undergraduate"),
    "BSc International Studies": ("BSC-INT", "Bachelor of Science in International Studies", "undergraduate"),
    "BSc Mass Comm": ("BSC-MCM", "Bachelor of Science in Mass Communication", "undergraduate"),
    "BSc Political Science": ("BSC-POL", "Bachelor of Science in Political Science", "undergraduate"),
    "BSc Public Administration": ("BSC-PAD", "Bachelor of Science in Public Administration", "undergraduate"),
    "BSc Sociology": ("BSC-SOC", "Bachelor of Science in Sociology", "undergraduate"),
    "BNSc": ("BNSC", "Bachelor of Nursing Science", "undergraduate"),
    "BLIS": ("BLIS", "Bachelor of Library and Information Science", "undergraduate"),

    # Postgraduate Programs
    "MBA": ("MBA", "Master of Business Administration", "postgraduate"),
    "MPA": ("MPA", "Master of Public Administration", "postgraduate"),
    "MPH": ("MPH", "Master of Public Health", "postgraduate"),
    "MIM": ("MIM", "Master of Information Management", "postgraduate"),
    "MIAD": ("MIAD", "Master in International Affairs and Diplomacy", "postgraduate"),
    "MLCJ": ("MLCJ", "Master in Law Enforcement and Criminal Justice", "postgraduate"),
    "MAC": ("MAC", "Master in Accounting", "postgraduate"),
    "PGDE": ("PGDE", "Postgraduate Diploma in Education", "postgraduate"),
    "PGDM": ("PGDM", "Postgraduate Diploma in Management", "postgraduate"),
    "PGDIM": ("PGDIM", "Postgraduate Diploma in Information Management", "postgraduate"),
    "CDRM": ("CDRM", "Certificate in Disaster Risk Management", "certificate"),
}


def _identify_program(filename: str):
    """Return the (code, name, type) entry best matching *filename*, or None.

    Every key that occurs in the filename (case-insensitively) is a
    candidate. Candidates are ranked first by whether the key appears as a
    standalone token (not glued to other letters/digits), then by key
    length; ties keep the table order. This stops a short code buried in an
    unrelated word (e.g. "MPA" inside "Company") from beating a genuine
    program token elsewhere in the name.
    """
    lowered = filename.lower()
    best_info = None
    best_rank = None
    for key, info in _PROGRAM_INFO.items():
        needle = key.lower()
        if needle not in lowered:
            continue
        standalone = re.search(
            r"(?<![a-z0-9])" + re.escape(needle) + r"(?![a-z0-9])", lowered
        ) is not None
        rank = (standalone, len(needle))
        # Strict comparison so the earliest table entry wins a tie.
        if best_rank is None or rank > best_rank:
            best_info, best_rank = info, rank
    return best_info


def process_handbook(pdf_path: str, output_path: str) -> None:
    """Process a single handbook PDF and write its Q&A pairs as JSON.

    The program is identified from the PDF's filename. When it is
    recognised and text can be extracted, the generated pairs are written to
    ``<output_path>/<program code lower-cased>_qa.json``.

    Args:
        pdf_path: Path of the handbook PDF.
        output_path: Directory that receives the JSON file; created if it
            does not exist yet.

    Returns:
        None. Unrecognised filenames, empty extractions and unexpected
        errors are logged and the function returns without raising.
    """
    try:
        # Extract program code and name from filename
        filename = os.path.basename(pdf_path)
        match = _identify_program(filename)
        if match is None:
            logger.warning(f"Could not determine program code for {filename}")
            return
        program_code, program_name, program_type = match

        # Extract text from PDF
        text = extract_text_from_pdf(pdf_path)
        if not text:
            logger.error(f"No text extracted from {pdf_path}")
            return

        # Create Q&A pairs
        # NOTE(review): "certificate" is forwarded unchanged; confirm the
        # generator produces appropriate wording for certificate programs.
        qa_pairs = create_program_qa_pairs(text, program_code, program_name, program_type)

        # Save to JSON file. An empty output_path means "current directory",
        # which os.makedirs would reject, so only create non-empty paths.
        if output_path:
            os.makedirs(output_path, exist_ok=True)
        output_file = os.path.join(output_path, f"{program_code.lower()}_qa.json")
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(qa_pairs, f, indent=4)
        logger.info(f"Created Q&A pairs in {output_file}")

    except Exception as e:
        # Keep the batch running, but record the traceback for diagnosis.
        logger.exception(f"Error processing {pdf_path}: {str(e)}")

def main():
    """Generate Q&A JSON files for every PDF in the ``documents`` directory.

    The directory is created when missing, and the JSON output is written
    back into that same directory. Any error is logged, not raised.
    """
    try:
        handbook_dir = "documents"
        # Guarantee the directory exists so listing it cannot fail on a fresh checkout
        os.makedirs(handbook_dir, exist_ok=True)

        # Lazily pick out the PDF entries (extension check is case-insensitive)
        pdf_names = (
            entry
            for entry in os.listdir(handbook_dir)
            if entry.lower().endswith(".pdf")
        )
        for pdf_name in pdf_names:
            process_handbook(os.path.join(handbook_dir, pdf_name), handbook_dir)

    except Exception as e:
        logger.error(f"Error in main: {str(e)}")

# Run the batch conversion only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main() 