"""
Dictionary-based longest-matching Thai word segmentation. Implementation is based
on the code from Patorn Utenpattanun.

:See Also:
    * `GitHub Repository <https://github.com/patorn/thaitokenizer/blob/master/thaitokenizer/tokenizer.py>`_
"""
import re
from typing import List, Union

from pythainlp import thai_tonemarks
from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie

# Dependent vowels/diacritics that cannot start a token on their own.
_FRONT_DEP_CHAR = ["ะ", "ั", "า ", "ำ", "ิ", "ี", "ึ", "ื", "ุ", "ู", "ๅ", "็", "์", "ํ"]
# Characters whose following character must stay in the same token.
_REAR_DEP_CHAR = ["ั", "ื", "เ", "แ", "โ", "ใ", "ไ", "ํ"]
# Marks that may trail a matched word (repetition and abbreviation marks).
_TRAILING_CHAR = ["ๆ", "ฯ"]

_RE_NONTHAI = re.compile(r"[A-Za-z\d]*")

_KNOWN = True
_UNKNOWN = False


class LongestMatchTokenizer:
    def __init__(self, trie: Trie):
        self.__trie = trie

    @staticmethod
    def __search_nonthai(text: str) -> Union[None, str]:
        # Return a leading run of Latin letters/digits, lowercased, if any.
        match = _RE_NONTHAI.search(text)
        if match.group(0):
            return match.group(0).lower()
        return None

    def __is_next_word_valid(self, text: str, begin_pos: int) -> bool:
        # True if the text after begin_pos is empty, non-Thai, or starts
        # with a word found in the dictionary trie.
        text = text[begin_pos:].strip()
        if not text:
            return True
        if self.__search_nonthai(text):
            return True
        for pos in range(len(text) + 1):
            if text[0:pos] in self.__trie:
                return True
        return False

    def __longest_matching(self, text: str, begin_pos: int) -> str:
        # Longest dictionary word starting at begin_pos, preferring a match
        # whose remainder also begins with a known word.
        text = text[begin_pos:]
        match = self.__search_nonthai(text)
        if match:
            return match

        word = None
        word_valid = None
        for pos in range(len(text) + 1):
            w = text[0:pos]
            if w in self.__trie:
                word = w
                if self.__is_next_word_valid(text, pos):
                    word_valid = w

        if not word:
            return ""
        if not word_valid:
            word_valid = word
        try:
            len_word_valid = len(word_valid)
            # Absorb a trailing repetition/abbreviation mark into the word.
            if text[len_word_valid] in _TRAILING_CHAR:
                return text[0 : len_word_valid + 1]
            return word_valid
        except BaseException:
            return word_valid

    def __segment(self, text: str) -> List[str]:
        begin_pos = 0
        len_text = len(text)
        tokens = []
        token_statuses = []
        while begin_pos < len_text:
            match = self.__longest_matching(text, begin_pos)
            if not match:
                # Unknown character: glue it onto the previous token if it is
                # a dependent character, a tone mark, or continues an unknown
                # run; otherwise start a new unknown token.
                if begin_pos != 0 and not text[begin_pos].isspace() and (
                    text[begin_pos] in _FRONT_DEP_CHAR
                    or text[begin_pos - 1] in _REAR_DEP_CHAR
                    or text[begin_pos] in thai_tonemarks
                    or (token_statuses and token_statuses[-1] == _UNKNOWN)
                ):
                    tokens[-1] += text[begin_pos]
                    token_statuses[-1] = _UNKNOWN
                else:
                    tokens.append(text[begin_pos])
                    token_statuses.append(_UNKNOWN)
                begin_pos += 1
            else:
                # Known word: glue it onto the previous token if the previous
                # character is rear-dependent; otherwise emit a new token.
                if begin_pos != 0 and text[begin_pos - 1] in _REAR_DEP_CHAR:
                    tokens[-1] += match
                else:
                    tokens.append(match)
                    token_statuses.append(_KNOWN)
                begin_pos += len(match)

        return tokens

    def tokenize(self, text: str) -> List[str]:
        return self.__segment(text)


def segment(text: str, custom_dict: Trie = DEFAULT_WORD_DICT_TRIE) -> List[str]:
    """
    Dictionary-based longest matching word segmentation.

    :param str text: text to be tokenized into words
    :param pythainlp.util.Trie custom_dict: dictionary for tokenization
    :return: list of words, tokenized from the text
    """
    if not text or not isinstance(text, str):
        return []
    if not custom_dict:
        custom_dict = DEFAULT_WORD_DICT_TRIE
    return LongestMatchTokenizer(custom_dict).tokenize(text)
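

# ---------------------------------------------------------------------------
# Usage sketch (added for illustration, not part of the original module).
# It shows the segment() entry point with the default dictionary and with a
# caller-supplied Trie. It assumes pythainlp's bundled word list is installed
# and that pythainlp.util.Trie can be built from an iterable of words; the
# sample sentence and the custom word list are made up, and the exact tokens
# returned depend on the dictionary contents.
if __name__ == "__main__":
    # Default dictionary: longest matching over the bundled Thai word list.
    print(segment("แมวกินปลา"))  # e.g. ['แมว', 'กิน', 'ปลา']

    # Custom dictionary: only the supplied words can be matched; anything
    # else falls back to the unknown-character handling in __segment().
    custom_words = Trie(["แมว", "กินปลา"])
    print(segment("แมวกินปลา", custom_dict=custom_words))  # e.g. ['แมว', 'กินปลา']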