
    9j"                     "    d dl Z dedefdZd Zy)    Ntextreturnc                     | sy| j                         } t        j                  dd|       } t        j                  dd|       } | j                         S )z,Clean OCR text by removing noise characters. z^[|:\-.,_]+z[|:\-.,_]+$)stripresub)r   s    ,/media/conek/DATA/Code/OCR/backend/parser.py
clean_textr      sC    ::<D66."d+D66."d+D::<    c           	      "
   | D cg c]  }t        |d          }}g }| D ])  \  }}}t        |      }|s|j                  |||d       + t        d       |D ]  }t        d|d    d|d   dd	        d
d
d
d
dd
d
d
d}	t        j                  d      }
|D ]@  }|d   j                  dd
      }|
j                  |      }|s,|j                  d      |	d<    n |	d   s6|D ]1  }t        j                  dd
|d         }t        |      dk(  s,||	d<    n t        j                  d      }g }|D ]:  }|j                  |d         }|D ]  }|j                  ||d   d   d   d       ! < |j                  d        t        |      dk\  r|d   d   |	d<   t        |      dk\  r|d   d   |	d<   |D ]S  }|d   j                         }d|v r!dt        j                  dd
|      k(  rd|	d<    nd |v sd!|v sDd"|vsId#|vsNd$|	d<    n |	d   sB|D ]=  }|d   j                         }d%|v rd&|vrd|	d<    nd'|v sd(|v s.d)|vs3d&|vs8d$|	d<    n g d*}g }|D ]  }|d   }|j                         st        |      d+kD  s(d,}|D ]
  }||v sd-} n t        d. |D              rd-}|rP|j!                         }dt        |      cxk  rd/k  sun x|j                  ||d   d   d   |d   d0        |j                  d1        |r|d   d   |	d2<   g }g }t#        |      D ]S  \  }}|d   j                         }d3|v sd4|v sd5|v r|j                  |       d6|v s	d7|v sd8|v sC|j                  |       U |rt%        |      dz   }g }t'        |t        |            D ]Y  }||   }|d   j                         t        fd9d:D              r n+d;v r6|j                  |d          t        |      dk\  sY n d<j)                  |      |	d=<   |rt%        |      dz   }g }t'        |t        |            D ]w  }||   }|d   j                         t        fd>d?D              r nI|j                  |d         r
||dz   kD  r n+d@v rT|j                  |d          t        |      dAk\  sw n d<j)                  |      |	dB<   |	d=   rt        j                  dCd<|	d=         |	d=<   |	dB   rt        j                  dCd<|	dB         |	dB<   |	S c c}w )Dz
    Extract structured CCCD information from EasyOCR raw output.
    ocr_results is a list of tuples: (bbox, text, confidence)
       )r   bbox
confidencezCleaned OCR Texts:z- r   z (conf: r   z.2f)r   
   VIỆT NAM)	id_number	full_namedobsexnationalityplace_of_originplace_of_residenceexpiry_datez
\b\d{12}\b r   r   z\D   z\b\d{2}/\d{2}/\d{4}\br   )datey_coordc                     | d   S Nr    xs    r
   <lambda>z#extract_cccd_info.<locals>.<lambda>S   s
    1Y< r   )keyr   r      r   namz[^a-z]Namr   u   nữnuu   việtvietu   NữNAMVIETu   NỮNUu   VIỆT)/u   CỘNG HÒAu
   CỘNG HOAu	   XÃ HỘIu   CHỦ NGHĨAr   zVIET NAMu   ĐỘC LẬPu   TỰ DOu   HẠNH PHÚCzDOC LAPzTU DOz	HANH PHUCu   CĂN CƯỚCu
   CÔNG DÂNzCAN CUOCzCONG DANCITIZENIDENTITYCARDu   SỐNOu   HỌ VÀ TÊNz	HO VA TENz	FULL NAMEu
   NGÀY SINHz	NGAY SINHzDATE OF BIRTHu   GIỚI TÍNHz	GIOI TINHSEXu   QUỐC TỊCHz	QUOC TICHNATIONALITYu
   QUÊ QUÁNzQUE QUANzPLACE OF ORIGINu   NƠI THƯỜNG TRÚzNOI THUONG TRUzPLACE OF RESIDENCEu   CÓ GIÁ TRỊ ĐẾNzCO GIA TRI DENu   CỤC TRƯỞNGz
CUC TRUONGu   CẢNH SÁTzCANH SATu
   ĐĂNG KÝzDANG KY   FTc              3   <   K   | ]  }|j                           y wN)isdigit).0chars     r
   	<genexpr>z$extract_cccd_info.<locals>.<genexpr>   s     3d4<<>3s      )r   r   r   c                     | d   S r    r!   r"   s    r
   r$   z#extract_cccd_info.<locals>.<lambda>   s
    a	l r   r   
   quê quánzque quanplace of origin   thường trú
thuong truplace of residencec              3   &   K   | ]  }|v  
 y wr6   r!   r8   kwnext_txt_lowers     r
   r:   z$extract_cccd_info.<locals>.<genexpr>   s       LB2'  L   )r?   r@   	residence
   giá trịexpiry   cục trưởng)r>   r=   z, r   c              3   &   K   | ]  }|v  
 y wr6   r!   rC   s     r
   r:   z$extract_cccd_info.<locals>.<genexpr>   s     zB2'zrF   )rH   rI   rJ   u	   ký ngàyu   ngày cấp)rA   u   nơi thường trú   r   z\s*,\s*)r   appendprintr   compilereplacesearchgroupr	   lenfindallsortlowerupperisupperanysplit	enumeratemaxrangejoin)ocr_resultsres	raw_textsitemsr   r   confcleanediteminfo
id_patternmatchdate_patternfound_datesmatchesmtxtname_exclude_keywordspossible_namesexcluderD   wordsorigin_indicesresidence_indicesidx	start_idxorigin_linesi	next_itemresidence_linesrE   s                                 @r
   extract_cccd_inforz      sO    0;;CF#;I; E' dDT"LL"  

 D4<.l);C(@BCD
 # 	D M*J F|##C,!!$' %AD  	D66%T&\2D4yB$([!		 ::67LK  &&tF|4 	A<?1-  	 /0
;1!!nV,U
;1)!nV4]  	6l  "C<ERVVIr3%??DKs]dcks"vS'8$U	 ; 	Dv,$$&C|c 1#U3$#+3&6+<"(DK		 N F|<<>c$i!mG+ :"G
 3d33

E
'a'")) $#'<?1#5&*<&8+ %6 23*1-f5[ Nu% *	T6l  "3*"37HC7O!!#&s"lc&9=QUX=X$$S)* '!+	y#e*- 	AaI&v.446N   L  3K  L  L!BB	& 12< A%	 #'))L"9 )*Q.	y#e*- 	AaI&v.446N z2yzz""9V#45!i!m:K!NN""9V#45?#q(	 &*YY%?!" "$&&T4@Q;R"S !%'VVJdCW>X%Y!"KY <s   T)r   strr   rz   r!   r   r
   <module>r|      s!    	S S Rr   