import re

def clean_text(text: str) -> str:
    """Strip whitespace and OCR artifact characters from both ends of *text*.

    The artifact characters are ``| : - . , _``.  Whitespace and artifacts are
    removed together as one boundary class, so interleaved noise such as
    ``"| - Hello ."`` is fully trimmed to ``"Hello"``.  Characters in the
    interior of the text (e.g. the dots in ``"a.b"`` or the spaces between
    words) are never touched.

    Args:
        text: Raw text of one OCR detection.  Falsy values (``None``, ``""``)
            are accepted.

    Returns:
        The trimmed text, or ``""`` when the input is falsy or consists only
        of whitespace/artifact characters.
    """
    if not text:
        return ""
    # A single character class covers whitespace AND artifacts; stripping them
    # in separate passes would leave residue like "- Hello" from "| - Hello".
    return re.sub(r'^[\s|:\-.,_]+|[\s|:\-.,_]+$', '', text)

# Patterns shared by the CCCD field extractors below.
_CCCD_ID_RE = re.compile(r'\b\d{12}\b')
_CCCD_DATE_RE = re.compile(r'\b\d{2}/\d{2}/\d{4}\b')
# "VIỆT NAM" / "VIET NAM" (with any separator) must never be read as the sex
# value "Nam"; it is blanked out of upper-cased text before the sex fallback.
_CCCD_NATIONALITY_RE = re.compile(r'VI[ỆE]T[\W_]*NAM')

# Header/label phrases: an upper-case block containing any of these as a
# substring is not a person's name.
_CCCD_NAME_EXCLUDE_PHRASES = (
    "CỘNG HÒA", "CỘNG HOA", "XÃ HỘI", "CHỦ NGHĨA", "VIỆT NAM", "VIET NAM",
    "ĐỘC LẬP", "TỰ DO", "HẠNH PHÚC", "DOC LAP", "TU DO", "HANH PHUC",
    "CĂN CƯỚC", "CÔNG DÂN", "CAN CUOC", "CONG DAN", "CITIZEN", "IDENTITY", "CARD",
    "HỌ VÀ TÊN", "HO VA TEN", "FULL NAME", "NGÀY SINH", "NGAY SINH",
    "DATE OF BIRTH", "GIỚI TÍNH", "GIOI TINH", "QUỐC TỊCH", "QUOC TICH",
    "NATIONALITY", "QUÊ QUÁN", "QUE QUAN", "PLACE OF ORIGIN", "NƠI THƯỜNG TRÚ",
    "NOI THUONG TRU", "PLACE OF RESIDENCE", "CÓ GIÁ TRỊ ĐẾN", "CO GIA TRI DEN",
    "CỤC TRƯỞNG", "CUC TRUONG", "CẢNH SÁT", "CANH SAT", "ĐĂNG KÝ", "DANG KY",
)
# Very short labels are matched as whole tokens only: as substrings they would
# reject real names (e.g. "NO" inside "NONG" when OCR drops the diacritics).
_CCCD_NAME_EXCLUDE_WORDS = frozenset({"SỐ", "NO", "SEX"})

_CCCD_ORIGIN_LABELS = ("quê quán", "que quan", "place of origin")
_CCCD_ORIGIN_STOPS = (
    "thường trú", "thuong tru", "residence", "giá trị", "expiry", "cục trưởng",
)
_CCCD_RESIDENCE_LABELS = ("thường trú", "thuong tru", "place of residence")
_CCCD_RESIDENCE_STOPS = ("giá trị", "expiry", "cục trưởng", "ký ngày", "ngày cấp")


def _cccd_find_id_number(items: list) -> str:
    """Return the first 12-digit ID number found in *items*, or ``""``."""
    # Pass 1: a standalone 12-digit run once spaces inside the block are removed.
    for item in items:
        match = _CCCD_ID_RE.search(item["text"].replace(" ", ""))
        if match:
            return match.group(0)
    # Pass 2: any block whose digits alone add up to exactly 12 characters
    # (covers digits glued to a label or split by stray punctuation).
    for item in items:
        digits = re.sub(r'\D', '', item["text"])
        if len(digits) == 12:
            return digits
    return ""


def _cccd_find_dates(items: list) -> list:
    """Return every dd/mm/yyyy date in *items*, ordered top to bottom."""
    found = [
        (item["bbox"][0][1], date)
        for item in items
        for date in _CCCD_DATE_RE.findall(item["text"])
    ]
    # Sort on the y coordinate only; the sort is stable, so dates at the same
    # height keep their OCR order.
    found.sort(key=lambda pair: pair[0])
    return [date for _, date in found]


def _cccd_find_sex(items: list) -> str:
    """Return ``"Nam"``, ``"Nữ"`` or ``""`` from the sex field in *items*."""
    # Pass 1: a block that is exactly "Nam", or one mentioning "nữ"/"nu" that
    # is not a nationality line.
    for item in items:
        lowered = item["text"].lower()
        if "nam" in lowered and re.sub(r'[^a-z]', '', lowered) == "nam":
            return "Nam"
        if (("nữ" in lowered or "nu" in lowered)
                and "việt" not in lowered and "viet" not in lowered):
            return "Nữ"

    # Pass 2 (fallback): looser substring match.  The nationality phrase is
    # blanked out first so "VIỆT NAM" in the header or nationality field is not
    # mistaken for "Nam", while a merged line such as
    # "Giới tính: Nữ Quốc tịch: Việt Nam" can still yield its sex value.
    for item in items:
        upper = _CCCD_NATIONALITY_RE.sub(" ", item["text"].upper())
        if "VIỆT" in upper or "VIET" in upper:
            continue
        if "NAM" in upper:
            return "Nam"
        if "NỮ" in upper or "NU" in upper:
            return "Nữ"
    return ""


def _cccd_is_name_candidate(text: str) -> bool:
    """Return True if *text* looks like a full name: upper-case, 2-5 words."""
    if not text.isupper() or len(text) <= 4:
        return False
    if any(char.isdigit() for char in text):
        return False
    if any(phrase in text for phrase in _CCCD_NAME_EXCLUDE_PHRASES):
        return False
    if _CCCD_NAME_EXCLUDE_WORDS.intersection(re.findall(r'\w+', text)):
        return False
    return 2 <= len(text.split()) <= 5


def _cccd_find_full_name(items: list) -> str:
    """Return the top-most name-like block in *items*, or ``""``."""
    candidates = [item for item in items if _cccd_is_name_candidate(item["text"])]
    if not candidates:
        return ""
    # min() keeps the first of several equally high candidates, matching a
    # stable top-to-bottom sort.
    return min(candidates, key=lambda item: item["bbox"][0][1])["text"]


def _cccd_lines_after_label(items: list, label_keywords, stop_keywords,
                            max_lines: int, stop_on_date: bool = False) -> str:
    """Join the value lines that follow the last block matching a label.

    Lines are read in OCR order starting right after the last label block and
    collection ends at the first block containing a stop keyword, at a date
    (only when *stop_on_date* is set, and never for the first two blocks after
    the label), or once *max_lines* lines were collected.  Returns ``""`` when
    no label block exists.
    """
    label_positions = [
        idx for idx, item in enumerate(items)
        if any(kw in item["text"].lower() for kw in label_keywords)
    ]
    if not label_positions:
        return ""

    lines = []
    for offset, item in enumerate(items[label_positions[-1] + 1:]):
        lowered = item["text"].lower()
        if any(kw in lowered for kw in stop_keywords):
            break
        if stop_on_date and offset > 1 and _CCCD_DATE_RE.search(item["text"]):
            break
        lines.append(item["text"])
        if len(lines) >= max_lines:
            break
    # Normalise spacing around every comma, including commas inside a line.
    return re.sub(r'\s*,\s*', ', ', ", ".join(lines))


def extract_cccd_info(ocr_results) -> dict:
    """Extract structured CCCD (citizen ID card) fields from OCR detections.

    Args:
        ocr_results: Iterable of ``(bbox, text, confidence)`` tuples.
            ``bbox[0][1]`` is read as the vertical (y) coordinate of a
            detection and is used to order blocks top to bottom.  ``None`` is
            treated as an empty result.

    Returns:
        A dict with the keys ``id_number``, ``full_name``, ``dob``, ``sex``,
        ``nationality``, ``place_of_origin``, ``place_of_residence`` and
        ``expiry_date``.  Fields that could not be found are ``""``;
        ``nationality`` is always ``"VIỆT NAM"``.  ``dob`` is the top-most
        dd/mm/yyyy date and ``expiry_date`` the second one.

    Side effects:
        Prints the cleaned OCR texts and their confidences to stdout.
    """
    if ocr_results is None:
        ocr_results = []

    # Keep only detections that still have text after boundary cleaning.
    items = []
    for bbox, text, conf in ocr_results:
        cleaned = clean_text(text)
        if cleaned:
            items.append({"text": cleaned, "bbox": bbox, "confidence": conf})

    # NOTE(review): debug output of card contents; consider moving to logging.
    print("Cleaned OCR Texts:")
    for item in items:
        print(f"- {item['text']} (conf: {item['confidence']:.2f})")

    dates = _cccd_find_dates(items)
    return {
        "id_number": _cccd_find_id_number(items),
        "full_name": _cccd_find_full_name(items),
        "dob": dates[0] if dates else "",
        "sex": _cccd_find_sex(items),
        "nationality": "VIỆT NAM",
        # Place of origin is usually 1-2 lines, place of residence 1-3 lines.
        "place_of_origin": _cccd_lines_after_label(
            items, _CCCD_ORIGIN_LABELS, _CCCD_ORIGIN_STOPS, max_lines=2,
        ),
        "place_of_residence": _cccd_lines_after_label(
            items, _CCCD_RESIDENCE_LABELS, _CCCD_RESIDENCE_STOPS, max_lines=3,
            stop_on_date=True,
        ),
        "expiry_date": dates[1] if len(dates) > 1 else "",
    }
