import os
import json
import fitz  # PyMuPDF
import re
import logging

# Set up logging: configure the root logger to emit INFO and above, then
# create the module-level logger that the functions in this file report to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

def extract_text_from_pdf(pdf_path):
    """Extract the text of every page of a PDF as a single string.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        The text of all pages concatenated in page order (an empty string
        if no page yields text), or ``None`` if the file could not be
        opened or read. Errors are logged rather than raised.
    """
    try:
        logger.info(f"Opening PDF: {pdf_path}")
        doc = fitz.open(pdf_path)
        try:
            # Join once instead of growing a string with += for every page.
            text = "".join(page.get_text() for page in doc)
        finally:
            # Release the document handle even when a page fails to extract.
            doc.close()
        logger.info(f"Successfully extracted text from {pdf_path}")
        return text
    except Exception as e:
        logger.error(f"Error extracting text from {pdf_path}: {str(e)}")
        return None

def create_qa_pairs_accounting(text):
    """Create Q&A pairs for the BSc Accounting handbook.

    A fixed overview entry is always produced. When ``text`` is non-empty,
    two further entries are added if the corresponding information is found:
    the program duration and a listing of ``ACC###`` course lines.

    Args:
        text: Text extracted from the handbook PDF. May be empty or ``None``,
            in which case only the overview entry is returned.

    Returns:
        A list of dicts, each with the keys ``"intent"``, ``"questions"``
        and ``"answer"``. An empty list is returned if an unexpected error
        occurs while building the entries (the error is logged).
    """
    try:
        qa_pairs = []

        # Course Information
        qa_pairs.append({
            "intent": "bsc_accounting_overview",
            "questions": [
                "What is BSc Accounting at ABU DLC?",
                "Tell me about the BSc Accounting program",
                "What can I learn in BSc Accounting?",
                "What is the overview of BSc Accounting program?",
                "Describe the BSc Accounting course at ABU DLC"
            ],
            "answer": "The BSc Accounting program at ABU DLC is designed to provide students with comprehensive knowledge of accounting principles, practices, and technologies. The program is fully accredited by the NUC and professional accounting bodies."
        })

        if not text:
            logger.warning("No text content found in accounting handbook")
            return qa_pairs

        # Extract course duration if found. The search is case-insensitive
        # (lower-cased text) and confined to a single line, because "." does
        # not match a newline.
        duration_match = re.search(r"duration.*?(\d+).*?years", text.lower())
        if duration_match:
            qa_pairs.append({
                "intent": "bsc_accounting_duration",
                "questions": [
                    "How long is the BSc Accounting program?",
                    "What is the duration of BSc Accounting?",
                    "How many years does it take to complete BSc Accounting?",
                    "What's the program duration for BSc Accounting?",
                    "Time required for BSc Accounting completion?"
                ],
                "answer": f"The BSc Accounting program duration is {duration_match.group(1)} years."
            })

        # Extract course structure if found. Each entry runs from a course
        # code to the next code on the same line, or to the end of that line.
        # re.MULTILINE is required so "$" matches at every line end: without
        # it, "$" only matches at the end of the whole text and "." cannot
        # cross a newline, so a one-course-per-line listing would lose every
        # course except the one on the last line.
        course_structure = [
            entry.strip()
            for entry in re.findall(
                r"ACC\d{3}.*?(?=ACC\d{3}|$)", text, re.MULTILINE
            )
        ]
        if course_structure:
            qa_pairs.append({
                "intent": "bsc_accounting_courses",
                "questions": [
                    "What courses are offered in BSc Accounting?",
                    "List the courses for BSc Accounting",
                    "What subjects will I study in BSc Accounting?",
                    "Show me the course structure for BSc Accounting",
                    "What are the modules in BSc Accounting?"
                ],
                # Only the first ten entries are listed to keep the answer short.
                "answer": "The BSc Accounting program includes the following courses:\n" +
                         "\n".join(course_structure[:10])
            })

        logger.info(f"Created {len(qa_pairs)} Q&A pairs for accounting")
        return qa_pairs
    except Exception as e:
        logger.error(f"Error creating accounting Q&A pairs: {str(e)}")
        return []

def create_qa_pairs_nursing(text):
    """Build the Q&A entries for the BNSc Nursing handbook.

    The result always starts with a fixed program-overview entry. If ``text``
    is non-empty and mentions clinical practice requirements (matched
    case-insensitively within a single line), a clinical-requirements entry
    is appended.

    Args:
        text: Text extracted from the handbook PDF; may be empty or ``None``.

    Returns:
        A list of dicts with the keys ``"intent"``, ``"questions"`` and
        ``"answer"``, or an empty list if an unexpected error occurs (the
        error is logged).
    """
    try:
        # Program overview: included regardless of the handbook contents.
        overview_entry = {
            "intent": "bnsc_nursing_overview",
            "questions": [
                "What is BNSc Nursing at ABU DLC?",
                "Tell me about the BNSc Nursing program",
                "What can I learn in BNSc Nursing?",
                "What is the overview of BNSc Nursing program?",
                "Describe the BNSc Nursing course at ABU DLC",
            ],
            "answer": "The Bachelor of Nursing Science (BNSc) program at ABU DLC is designed to produce professional nurses with comprehensive theoretical knowledge and practical skills in nursing care. The program is fully accredited by the NUC and Nursing and Midwifery Council of Nigeria.",
        }
        entries = [overview_entry]

        if text:
            # Clinical requirements: only added when the handbook mentions them.
            lowered = text.lower()
            if re.search(r"clinical.*?practice.*?requirements?", lowered):
                clinical_entry = {
                    "intent": "bnsc_clinical_requirements",
                    "questions": [
                        "What are the clinical requirements for BNSc?",
                        "Tell me about clinical practice in BNSc Nursing",
                        "What clinical experience do I need for BNSc?",
                        "Clinical practice requirements for nursing students",
                        "How does clinical training work in BNSc?",
                    ],
                    "answer": "The BNSc program includes mandatory clinical practice requirements. Students must complete supervised clinical rotations in approved healthcare facilities to gain practical nursing experience.",
                }
                entries.append(clinical_entry)
            logger.info(f"Created {len(entries)} Q&A pairs for nursing")
        else:
            logger.warning("No text content found in nursing handbook")

        return entries
    except Exception as e:
        logger.error(f"Error creating nursing Q&A pairs: {str(e)}")
        return []

def main():
    """Generate the Q&A JSON file for each handbook PDF under ``documents/``.

    For every known handbook: if the PDF exists, its text is extracted, the
    matching builder turns it into Q&A pairs, and the result is written as
    indented JSON. A missing PDF or a PDF that yields no text is logged and
    skipped. Errors are logged rather than raised, and a failure while
    processing one handbook does not stop the remaining ones.
    """
    # (source PDF, Q&A builder, output JSON) for each supported handbook.
    handbooks = (
        (
            "documents/BSc Accounting Student Handbook.pdf",
            create_qa_pairs_accounting,
            "documents/accounting_qa.json",
        ),
        (
            "documents/BNSc Students Handbook.pdf",
            create_qa_pairs_nursing,
            "documents/nursing_qa.json",
        ),
    )

    try:
        # Create output directory if it doesn't exist
        os.makedirs("documents", exist_ok=True)

        for pdf_path, build_qa_pairs, output_file in handbooks:
            if not os.path.exists(pdf_path):
                logger.warning(f"File not found: {pdf_path}")
                continue

            # Isolate each handbook so one failure (e.g. an unwritable output
            # file) does not prevent the others from being processed.
            try:
                logger.info(f"Processing {pdf_path}")
                text = extract_text_from_pdf(pdf_path)
                if not text:
                    # Extraction failed or the PDF has no text; say so
                    # instead of skipping silently, and leave any existing
                    # output file untouched.
                    logger.warning(f"No text extracted from {pdf_path}; skipping")
                    continue

                qa_pairs = build_qa_pairs(text)
                with open(output_file, "w", encoding="utf-8") as f:
                    json.dump(qa_pairs, f, indent=4)
                logger.info(f"Created Q&A pairs in {output_file}")
            except Exception as e:
                logger.error(f"Error processing {pdf_path}: {str(e)}")

    except Exception as e:
        logger.error(f"Error in main: {str(e)}")

# Run the handbook processing only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()