a
     g}                     @   s   d Z ddlZddlmZ ddlmZ ddlmZ ddlm	Z	 e	eddd	Z
d
e dZeeZee ee dddZeee dddZdS )a[  
Segmenting text into Enhanced Thai Character Clusters (ETCCs)
Python implementation by Wannaphong Phatthiyaphaibun

This implementation relies on a dictionary of ETCC created from etcc.txt
in pythainlp/corpus.

Notebook:
https://colab.research.google.com/drive/1UTQgxxMRxOr9Jp1B1jcq1frBNvorhtBQ

:See Also:

Jeeragone Inrut, Patiroop Yuanghirun, Sarayut Paludkong, Supot Nitsuwat, and
Para Limmaneepraserth. "Thai word segmentation using combination of forward
and backward longest matching techniques." In International Symposium on
Communications and Information Technology (ISCIT), pp. 37-40. 2001.
    N)List)thai_follow_vowels)
get_corpus)	Tokenizerzetcc.txtlongest)Zengine[u   ๆฯ])tokensreturnc                 C   sp   t | }d}||krqlt| | rb|dkrbt | | dkrb| |d   | | 7  < | |= |d8 }|d7 }q| S )Nr      )len_RE_ENDING_CHARsearch)r   Z
len_tokensi r   A/usr/local/lib/python3.9/dist-packages/pythainlp/tokenize/etcc.py_cut_subword!   s    &
r   )textr	   c                 C   s    | rt | tsg S tt| S )aU  
    Segmenting text into ETCCs.

    Enhanced Thai Character Cluster (ETCC) is a kind of subword unit.
    The concept was presented in Inrut, Jeeragone, Patiroop Yuanghirun,
    Sarayut Paludkong, Supot Nitsuwat, and Para Limmaneepraserth.
    "Thai word segmentation using combination of forward and backward
    longest matching techniques." In International Symposium on Communications
    and Information Technology (ISCIT), pp. 37-40. 2001.

    :param str text: text to be tokenized into character clusters
    :return: list of clusters, tokenized from the text
    :return: List[str]
    )
isinstancestrr   	_cut_etccZword_tokenize)r   r   r   r   segment/   s    r   )__doc__retypingr   Z	pythainlpr   Zpythainlp.corpusr   Zpythainlp.tokenizer   r   Z_PAT_ENDING_CHARcompiler   r   r   r   r   r   r   r   <module>   s   
