a
     †‹g¹  ã                   @   sR   d Z ddlmZmZ ddlmZ eddZg d¢efeee ee dœdd	„Zd
S )zÝ
nercut 0.2

Dictionary-based maximal matching word segmentation, constrained by
Thai Character Cluster (TCC) boundaries, and combining tokens that are
parts of the same named entity.

Code by Wannaphong Phatthiyaphaibun
é    )ÚIterableÚList)ÚNERZthainer)Zengine)ZORGANIZATIONZPERSONZPHONEZEMAILZDATEZTIME)ÚtextÚtaglistÚreturnc           
      C   s  t | tƒsg S |j| dd}g }d}t|ƒD ]æ\}\}}|dkrN|dd… }	nd}	| d¡rj|	|v rj|}n\| d¡rŽ|dkrŽ|	|v rŽ||7 }n8|dkr¸|dkr¸| |¡ d}| |¡ nd}| |¡ |d	 t|ƒkr,| d¡rô|dkrô| |¡ q,| d¡r,|dkr,| |¡ q,q,|S )
a‘  
    Dictionary-based maximal matching word segmentation, constrained by
    Thai Character Cluster (TCC) boundaries, and combining tokens that are
    parts of the same named-entity.

    :param str text: text to be tokenized into words
    :param list taglist: a list of named entity tags to be used
    :param class tagger: NER tagger engine
    :return: list of words, tokenized from the text
    F)ÚposÚ ÚOé   NzB-zI-é   )Ú
isinstanceÚstrÚtagÚ	enumerateÚ
startswithÚappendÚlen)
r   r   ZtaggerZtagged_wordsÚwordsZcombining_wordÚidxZ	curr_wordZcurr_tagr   © r   úC/usr/local/lib/python3.9/dist-packages/pythainlp/tokenize/nercut.pyÚsegment   s>    
ÿþý


r   N)	Ú__doc__Útypingr   r   Zpythainlp.tag.named_entityr   Z_thainerr   r   r   r   r   r   Ú<module>   s   	
ö	õ