"""
Dictionary-based maximal matching word segmentation, constrained by
Thai Character Cluster (TCC) boundaries with improved rules.

The code is based on notebooks created by Korakot Chaovavanich,
with a heuristic graph-size limit added to avoid exponential waiting time.

:See Also:
    * https://colab.research.google.com/notebook#fileId=1V1Z657_5eSWPo8rLfVRwA0A5E4vkg7SI
    * https://colab.research.google.com/drive/14Ibg-ngZXj15RKwjNwoZlOT32fQBOrBx#scrollTo=MYZ7NzAR7Dmw
"""
import re
from collections import defaultdict
from heapq import heappop, heappush
from typing import Generator, List

from pythainlp.tokenize import DEFAULT_WORD_DICT_TRIE
from pythainlp.util import Trie
from pythainlp.tokenize.tcc_p import tcc_pos

# match non-Thai tokens
_PAT_NONTHAI = re.compile(
    r"""(?x)
[-a-zA-Z]+|        # Latin characters
\d+([,\.]\d+)*|    # numbers
[ \t]+|            # spaces
\r?\n|             # newlines
[^\u0E00-\u0E7F \t\r\n]+  # other non-Thai characters; stop matching at space/newline
"""
)

# match runs of up to two Thai consonants (used to filter out too-short matches)
_PAT_THAI_TWOCHARS = re.compile("[ก-ฮ]{,2}$")

# heuristic limit on the size of the matching graph,
# to avoid exponential waiting time
_MAX_GRAPH_SIZE = 50

# window size for safe mode
_TEXT_SCAN_POINT = 120
_TEXT_SCAN_LEFT = 20
_TEXT_SCAN_RIGHT = 20
_TEXT_SCAN_BEGIN = _TEXT_SCAN_POINT - _TEXT_SCAN_LEFT
_TEXT_SCAN_END = _TEXT_SCAN_POINT + _TEXT_SCAN_RIGHT
del _TEXT_SCAN_POINT
del _TEXT_SCAN_LEFT
del _TEXT_SCAN_RIGHT


def _bfs_paths_graph(
    graph: defaultdict, start: int, goal: int
) -> Generator[List[int], None, None]:
    """Breadth-first search for paths from start to goal in the matching graph."""
    queue = [(start, [start])]
    while queue:
        (vertex, path) = queue.pop(0)
        for pos in graph[vertex]:
            if pos == goal:
                yield path + [pos]
            else:
                queue.append((pos, path + [pos]))


def _onecut(text: str, custom_dict: Trie) -> Generator[str, None, None]:
    """Generate tokens for one chunk of text, using maximal matching."""
    # main data structure:
    # - key is a begin position (int)
    # - value is a list of possible end positions (List[int])
    graph = defaultdict(list)
    graph_size = 0  # keep track of graph size; if too big, force a cutoff

    valid_poss = tcc_pos(text)  # breaking positions that are TCC-valid

    len_text = len(text)
    pos_list = [0]  # priority queue of possible breaking positions
    end_pos = 0
    while pos_list[0] < len_text:
        begin_pos = heappop(pos_list)
        for word in custom_dict.prefixes(text[begin_pos:]):
            end_pos_candidate = begin_pos + len(word)
            if end_pos_candidate in valid_poss:
                graph[begin_pos].append(end_pos_candidate)
                graph_size = graph_size + 1

                if end_pos_candidate not in pos_list:
                    heappush(pos_list, end_pos_candidate)

                if graph_size > _MAX_GRAPH_SIZE:
                    break

        len_pos_list = len(pos_list)
        if len_pos_list == 1:  # one candidate position: no longer ambiguous
            end_pos_candidates = next(
                _bfs_paths_graph(graph, end_pos, pos_list[0])
            )
            graph_size = 0
            for pos in end_pos_candidates[1:]:
                yield text[end_pos:pos]
                end_pos = pos
        elif len_pos_list == 0:  # no candidate position: dead end
            m = _PAT_NONTHAI.match(text[begin_pos:])
            if m:  # non-Thai token: skip to the end of the match
                end_pos = begin_pos + m.end()
            else:  # Thai token: find the minimum skip
                for pos in range(begin_pos + 1, len_text):
                    if pos in valid_poss:
                        prefix = text[pos:]
                        words = [
                            word
                            for word in custom_dict.prefixes(prefix)
                            if ((pos + len(word)) in valid_poss)
                            and not _PAT_THAI_TWOCHARS.match(word)
                        ]
                        if words:  # a Thai token longer than two characters
                            end_pos = pos
                            break

                        if _PAT_NONTHAI.match(prefix):  # a non-Thai token begins here
                            end_pos = pos
                            break
                else:
                    end_pos = len_text

            graph[begin_pos].append(end_pos)
            graph_size = graph_size + 1
            yield text[begin_pos:end_pos]
            heappush(pos_list, end_pos)
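

# Worked illustration (an editorial addition, not from the original module):
# suppose the matching graph ends up as {0: [2, 3], 2: [5], 3: [5]} and
# pos_list collapses to the single candidate [5] while end_pos is still 0.
# Then next(_bfs_paths_graph(graph, 0, 5)) returns the first path found in
# breadth-first order, [0, 2, 5], and _onecut yields text[0:2] and text[2:5]
# as two tokens.  The concrete positions here are hypothetical; real values
# depend on the dictionary and on the TCC-valid breaking positions.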


def segment(
    text: str,
    custom_dict: Trie = DEFAULT_WORD_DICT_TRIE,
    safe_mode: bool = False,
) -> List[str]:
    """Maximal-matching word segmentation, constrained by Thai Character Cluster boundaries.

    A dictionary-based word segmentation that uses the maximal matching
    algorithm, constrained by Thai Character Cluster boundaries.

    A custom dictionary can be supplied.

    :param text: text to be tokenized
    :type text: str
    :param custom_dict: tokenization dictionary,
        defaults to DEFAULT_WORD_DICT_TRIE
    :type custom_dict: Trie, optional
    :param safe_mode: reduce the chance of long processing time for long text
        with many ambiguous breaking points, defaults to False
    :type safe_mode: bool, optional
    :return: list of tokens
    :rtype: List[str]
    """
    if not text or not isinstance(text, str):
        return []

    if not custom_dict:
        custom_dict = DEFAULT_WORD_DICT_TRIE

    if not safe_mode or len(text) < _TEXT_SCAN_END:
        return list(_onecut(text, custom_dict))

    # if the text is longer than the scan limit,
    # break it into smaller chunks, then tokenize each chunk
    text_parts = []
    while len(text) >= _TEXT_SCAN_END:
        sample = text[_TEXT_SCAN_BEGIN:_TEXT_SCAN_END]

        # find a possible break position
        cut_pos = _TEXT_SCAN_END

        # try to break at a space first
        space_idx = sample.rfind(" ")
        if space_idx >= 0:
            cut_pos = space_idx + 1
        else:
            tokens = list(_onecut(sample, custom_dict))
            token_max_idx = 0
            token_max_len = 0
            for i, token in enumerate(tokens):
                if len(token) >= token_max_len:
                    token_max_len = len(token)
                    token_max_idx = i

            # choose the position that covers the longest token
            cut_pos = _TEXT_SCAN_BEGIN
            for i in range(0, token_max_idx):
                cut_pos = cut_pos + len(tokens[i])

        text_parts.append(text[:cut_pos])
        text = text[cut_pos:]

    # append the remaining text
    if len(text):
        text_parts.append(text)

    # tokenize each text part
    tokens = []
    for text_part in text_parts:
        tokens.extend(list(_onecut(text_part, custom_dict)))

    return tokens
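

# A minimal usage sketch (an editorial addition, not part of the original
# module).  The Thai input string, the custom word list, and the resulting
# token boundaries are illustrative assumptions: actual output depends on
# the dictionary that ships with the installed PyThaiNLP.
if __name__ == "__main__":
    example = "ทดสอบการตัดคำภาษาไทย"  # roughly: "test Thai word segmentation"

    # default dictionary, normal mode
    print(segment(example))

    # long text with safe_mode=True: the input is chunked before segmenting,
    # trading some accuracy around the cut points for bounded processing time
    print(segment(example * 20, safe_mode=True))

    # custom dictionary: only words present in the Trie can be matched;
    # everything else falls back to the unknown-token handling in _onecut()
    custom = Trie(["ทดสอบ", "การ", "ตัด", "คำ", "ภาษาไทย"])
    print(segment(example, custom_dict=custom))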