a
     g	                     @   s   d Z ddlZddlmZmZ dddddd	d
dddd Zed	eZ
eedddZeee dddZeee dddZdS )a  
The implementation of tokenizer according to Thai Character Clusters (TCCs)
rules proposed by `Theeramunkong et al. 2000.     <http://citeseerx.ist.psu.edu/viewdoc/summary?doi=10.1.1.59.2548>`_

Credits:
    * TCC: Jakkrit TeCho
    * Grammar: Wittawat Jitkrittum (`link to the source file       <https://github.com/wittawatj/jtcc/blob/master/TCC.g>`_)
    * Python code: Korakot Chaovavanich
    N)ListSetu  c[ั]([่-๋]c)?
c[ั]([่-๋]c)?k
เc็ck
เcctาะk
เccีtยะk
เccีtย(?=[เ-ไก-ฮ]|$)k
เc[ิีุู]tย(?=[เ-ไก-ฮ]|$)k
เcc็ck
เcิc์ck
เcิtck
เcีtยะ?k
เcืtอะk
เcื
เctา?ะ?k
c[ึื]tck
c[ะ-ู]tk
c[ิุู]์
cรรc์
c็
ct[ะาำ]?k
แc็ck
แcc์k
แctะk
แcc็ck
แccc์k
โctะk
[เ-ไ]ctk
ก็
อึ
หึ
ku   (cc?[d|ิ]?[์])?cu	   [ก-ฮ]tu
   [่-๋]?du   อูอุu   อ |)textreturnc                 c   sn   | rt | tsdS t| }d}||k rjt| |d }|rJ| d }nd}| |||  V  ||7 }qdS )z
    TCC generator which generates Thai Character Clusters

    :param str text: text to be tokenized into character clusters
    :return: subwords (character clusters)
    :rtype: Iterator[str]
    r   r   N   )
isinstancestrlen_PAT_TCCmatchspan)r
   Zlen_textpmn r   @/usr/local/lib/python3.9/dist-packages/pythainlp/tokenize/tcc.pytcc>   s    r   c                 C   sF   | rt | tst S t }d}t| D ]}|t|7 }|| q&|S )z
    TCC positions

    :param str text: text to be tokenized into character clusters
    :return: list of the ending position of subwords
    :rtype: set[int]
    r   )r   r   setr   r   add)r
   Zp_setr   wr   r   r   tcc_posU   s    r   c                 C   s   t t| S )z
    Subword segmentation

    :param str text: text to be tokenized into character clusters
    :return: list of subwords (character clusters), tokenized from the text
    :rtype: list[str]

    )listr   )r
   r   r   r   segmenti   s    
r   )__doc__retypingr   r   replacesplitZ_RE_TCCcompilejoinr   r   r   intr   r   r   r   r   r   <module>   s     "#$)