
    \jZ0                     2   d Z ddlZddlZddlmZ ddlmZmZ 	 ddl	m
Z
 ddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ  G d de      Z G d de      Zd Zd Zd ZddZd Zd Z G d de      ZddZ	 e dk(  r ed        ed       yy# e$ r Y uw xY w)z
Named entity chunker
    N)ElementTree)ClassifierBasedTaggerpos_tag)MaxentClassifier)ChunkParserI)
ChunkScore)find)word_tokenize)Treec                   *    e Zd ZdZddZd Zd Zd Zy)NEChunkParserTaggerz2
    The IOB tagger used by the chunk parser.
    Nc                 J    t        j                  | || j                  |       y )N)trainclassifier_builder
classifier)r   __init___classifier_builder)selfr   r   s      A/root/env/lib/python3.12/site-packages/nltk/chunk/named_entity.pyr   zNEChunkParserTagger.__init__$   s"    &&#77!		
    c                 4    t        j                  |ddd      S )Niis      )	algorithmgaussian_prior_sigmatrace)r   r   r   r   s     r   r   z'NEChunkParserTagger._classifier_builder,   s!    %%!"
 	
r   c                     	 | j                   }|S # t        $ r5 ddlm} t	        |j                  d            | _         | j                   }Y |S w xY w)Nr   )wordszen-basic)_en_wordlistAttributeErrornltk.corpusr    set)r   wlr    s      r   _english_wordlistz%NEChunkParserTagger._english_wordlist5   sS    	#""B 	  	#) #EKK
$; <D""B		#s    :AAc                    ||   d   }t        ||   d         }|dk(  rd x}}d x}}	d x}
x}}n|dk(  rA||dz
     d   j                         }d }t        ||dz
     d         }d }	||dz
     d   }d x}
}nu||dz
     d   j                         }||dz
     d   j                         }t        ||dz
     d         }t        ||dz
     d         }	||dz
     }||dz
     }t        |      }
|t        |      dz
  k(  r	d x}}d x}}n|t        |      dz
  k(  r7||dz      d   j                         }||dz      d   j                         }d }d }nd||dz      d   j                         }||dz      d   j                         }||dz      d   j                         }||dz      d   j                         }i dddt        |      dt        |      d|d d	 j                         d
|dd  j                         d|d|d|| j	                         v d|d|d|d|d|d|j                          d| d| d| d|
 d| }|S )Nr   r   r   biasTshapewordlenprefix3   suffix3poswordzen-wordlistprevtagprevposnextposprevwordnextwordzword+nextpos+zpos+prevtagzshape+prevtag)simplify_poslowerr)   lenr&   )r   tokensindexhistoryr0   r/   r4   prevprevwordr2   prevprevpos	prevshaper1   prevprevtagr5   nextnextwordr3   nextnextposfeaturess                     r   _feature_detectorz%NEChunkParserTagger._feature_detector?   sC   e}Q6%=+,A:&**H|$((Gk044I4+aZeai(+113HL"6%!)#4Q#78GKeai(+G&**Ieai(+113H!%!),Q/557L"6%!)#4Q#78G&veai'8';<Keai(G!%!),KhICK!O#&**H|$((Gkc&kAo%eai(+113HUQY'*002GLKeai(+113HUQY'*002G!%!),Q/557L +A.446K
D
U4[
 s4y
 tBQx~~'	

 tBCy(
 3
 D
 DD$:$:$<<
 w
 w
 w
 
 
 tzz|nAgY7
 cU!G9-
  	{!G95!
& r   )NN)__name__
__module____qualname____doc__r   r   r&   rD    r   r   r   r      s    

8r   r   c                   8    e Zd ZdZd Zd Zd Zd Zed        Z	y)NEChunkParser2
    Expected input: list of pos-tagged words
    c                 &    | j                  |       y N)_trainr   s     r   r   zNEChunkParser.__init__   s    Er   c                 ^    | j                   j                  |      }| j                  |      }|S )z8
        Each token should be a pos-tagged word
        )_taggertag_tagged_to_parse)r   r:   taggedtrees       r   parsezNEChunkParser.parse   s-     !!&)$$V,r   c                 l    |D cg c]  }| j                  |       }}t        |      | _        y c c}w )N)r   )_parse_to_taggedr   rQ   )r   corpusss      r   rO   zNEChunkParser._train   s1    4:;q$''*;;*8 <s   1c                    t        dg       }|D ]  \  }}|dk(  r|j                  |       |j                  d      r |j                  t        |dd |g             N|j                  d      s`|rAt        |d   t               r.|d   j	                         |dd k(  r|d   j                  |       |j                  t        |dd |g              |S )zH
        Convert a list of tagged tokens to a chunk-parse tree.
        SOB-r   NI-)r   append
startswith
isinstancelabel)r   tagged_tokenssenttokrR   s        r   rS   zNEChunkParser._tagged_to_parse   s     C}% 		6HCczC %DQR3%01%JtBx648>>;KsSTSUw;VHOOC(KKSWse 45		6 r   c                 >   g }| D ]  }t        |t              rpt        |      dk(  rt        d       -|j	                  |d   d|j                          f       |dd D ]&  }|j	                  |d|j                          f       ( |j	                  |df        |S )zH
        Convert a chunk-parse tree to a list of tagged tokens.
        r   z"Warning -- empty chunk in sentencer^   r   Nr_   r]   )rc   r   r9   printra   rd   )rf   tokschildrg   s       r   rX   zNEChunkParser._parse_to_tagged   s    
  		*E%&u:?>?U1XEKKM?';<= 9 =CKK5;;=/&: ;<= UCL)		* r   N)
rE   rF   rG   rH   r   rV   rO   rS   staticmethodrX   rI   r   r   rK   rK   z   s/    9$  r   rK   c                 ,   t        j                  d| t         j                        ryt        j                  d| t         j                        ryt        j                  d| t         j                        r#| j                         ry| j	                         ryyy	)
Nz![0-9]+(\.[0-9]*)?|[0-9]*\.[0-9]+$numberz\W+$punctz\w+$upcasedowncase	mixedcaseother)rematchUNICODEistitleislower)r0   s    r   r)   r)      sc    	xx4dBJJG	'4	,	'4	,<<>\\^r   c                 N    | j                  d      ry| j                  d      d   S )NV-r   )rb   split)rZ   s    r   r7   r7      s#    ||Cwws|Ar   c                 h   | j                         }d t        |      D        }t        dg       }| D ]~  }t        |t              rP|j	                  t        |j                         g              |D ]!  }|d   j	                  |t        |      f       # c|j	                  |t        |      f        |S )Nc              3   &   K   | ]	  \  }}|  y wrN   rI   ).0r0   r/   s      r   	<genexpr>zpostag_tree.<locals>.<genexpr>   s     6s6s   r\   r`   )leavesr   r   rc   ra   rd   next)rU   r    tag_iternewtreerk   subchilds         r   postag_treer      s    KKME6wu~6H3mG 4eT"NN4r23! ?""Hd8n#=>? NNE4>234 Nr   binaryc           	   #     K   | D ]  }t        j                  |      D ]e  \  }}}|j                  d      r|r|D ]F  }|j                  d      st        t         j                  j                  ||      |      E d {    H g  y 7 w)Nbnewsz.sgm)oswalkendswithload_ace_filepathjoin)rootsfmt
skip_bnewsrootdirsfilesfs          r   load_ace_datar      s      I!# 	ID$}}W%* I::f%,RWW\\$-BCHHHI	II Is   AB
-B
<B
=B
c           	   #     K   t        dt        j                  j                  |       d           | dz   }g }t	        |      5 }t        j                  |      j                         }d d d        j                  d      D ]  }|j                  d      j                  }|j                  d      D ]v  }|j                  d      dk7  rt        |j                  d	      j                        }	t        |j                  d
      j                        dz   }
|j                  |	|
|f       x  t	        |       5 }|j                         }d d d        t        j                   dd      }d }t        j                   d||      }t        j                   dd|      }t        j                   dd|      }t        j                   dd|      }|D 	
ch c]  \  }	}
}|
 }}
}	}|dk(  rd}t#        dg       }t%        |      D ]^  \  }	}
}|	|k  r|}	|
|	k  r|j'                  t)        |||	              |j                  t#        d||	|
 j                                      |
}` |j'                  t)        ||d               | y |dk(  rd}t#        dg       }t%        |      D ]^  \  }	}
}|	|k  r|}	|
|	k  r|j'                  t)        |||	              |j                  t#        |||	|
 j                                      |
}` |j'                  t)        ||d               | y t+        d      # 1 sw Y   xY w# 1 sw Y   xY wc c}}
}	w w)Nz  - r   z.tmx.rdc.xmlzdocument/entityentity_typeentity_mentionTYPENAMEzhead/charseq/startzhead/charseq/endz<(?!/?TEXT)[^>]+> c                 P    d| j                         | j                         z
  dz
  z  S )N    )endstart)ms    r   subfunczload_ace_file.<locals>.subfunc   s#    aeeg	)A-..r   z[\s\S]*<TEXT>z</TEXT>[\s\S]*z``z "z''z" r   r   r\   NE
multiclasszbad fmt value)ri   r   r   r|   openETrV   getrootfindallr	   textgetintra   readrt   subr   sortedextendr
   
ValueError)textfiler   annfileentitiesinfilexmlentitytypmentionrZ   er   r   entity_typesirj   s                   r   r   r      s    	Dx(+,
-.'G H	g )&hhv&&()++/0 )kk-(--~~&67 	)G{{6"f,GLL!56;;<AGLL!3499:Q>AOOQ3K(	)) 
h 6{{} 66%r40D/ 66"GT2D66#R.D 66$d#D66$d#D+344KQ3C4L4 hC}) 	IAq#1uAvKKd1Qi01KKT4!9??#456A	 	M$qr(+,
 
	C}) 	IAq#1uAvKKd1Qi01KKS$q)//"345A	 	M$qr(+,
 ))}) ) " 5sE   AM	$L('CM	5L5BM	
MEM	(L2-M	5L?:M	c           	      L   t         j                  |       } t         j                  |      }d}t        | |      D ]i  \  \  }}\  }}||cxk(  rdk(  r;n n8|rt        d|dd|dd|        t        dj	                  ddd             d}Rd}t        d|dd|dd|        k y )	NFr]   z  15r   z  {:15} {:15} {}z...T)rK   rX   zipri   format)correctguessedellipsiswctgts         r   
cmp_chunksr   .  s    ,,W5G,,W5GH1 ,B!R?s?2b'2b'1#./(//ueDEHBr"gQr"gQqc*+,r   c                   $    e Zd ZdZddZd Zd Zy)Maxent_NE_ChunkerrL   c                 Z    || _         t        d| d      | _        | j                          y )Nz+chunkers/maxent_ne_chunker_tab/english_ace_/)_fmtr	   _tab_dirload_params)r   r   s     r   r   zMaxent_NE_Chunker.__init__E  s+    	J3%qQRr   c                     ddl m}m}  || j                        \  }}}}t	         ||||      |      }t        |      | _        y )Nr   )BinaryMaxentFeatureEncodingload_maxent_params)alwayson_features)r   )nltk.classify.maxentr   r   r   r   r   rQ   )r   r   r   wgtmpglabaonmcs           r   r   zMaxent_NE_Chunker.load_paramsK  sB    X/>S#s'SCH#
 +b9r   c           	          ddl m} | j                  j                  }|j                  }|j
                  }|j                  }|j                  }|j                  }| j                  } |||||d| d       y )Nr   )save_maxent_paramsz/tmp/english_ace_r   )tab_dir)
r   r   rQ   _classifier	_encoding_weights_mapping_labels	_alwaysonr   )	r   r   classifecgr   r   r   r   r   s	            r   save_paramszMaxent_NE_Chunker.save_paramsT  sh    ;,,**llkkmmii3S#9J3%q7QRr   Nr   )rE   rF   rG   rH   r   r   r   rI   r   r   r   r   @  s    :
Sr   r   r   c                 <    t        |       }|j                          |S rN   )r   r   )r   chunkers     r   build_modelr   a  s    $GNr   __main__)r   Tr   )!rH   r   rt   	xml.etreer   r   nltk.tagr   r   nltk.classifyr   ImportErrornltk.chunk.apir   nltk.chunk.utilr   	nltk.datar	   nltk.tokenizer
   	nltk.treer   r   rK   r)   r7   r   r   r   r   r   r   rE   rI   r   r   <module>r      s    
 	 ' 3	. ( &  ' X/ Xv8L 8v ID*R,$S SB(T z }  		s   B BB