a
     g                     @   s   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 dZdZdZdZG d	d
 d
ZG dd deZG dd deZG dd deZG dd dZdS )z*
Command line for PyThaiNLP's tokenizers.
    N)cli)DEFAULT_SENT_TOKENIZE_ENGINEDEFAULT_SUBWORD_TOKENIZE_ENGINEDEFAULT_WORD_TOKENIZE_ENGINEsent_tokenizesubword_tokenizeword_tokenizez@@/~|c                   @   s   e Zd Zdd ZdS )
SubAppBasec                 C   s   t jf i td| }|jdtddd |jdddtd	| j | jd
 |jdddtd	| j | jd
 |jddddd |jddddd |jdd |	|}|| _
t|j| | j|j|j|jd}t|j||j  d S )Nz	tokenize text?z
input text)typenargshelpz-sz--sep	separatorz	default: )destr   r   defaultz-az--algo	algorithmz-wz--keep-whitespacekeep_whitespace
store_true)r   actionz-nwz--no-whitespacestore_falseT)r   )Zenginer   )argparseArgumentParserr   Z
make_usageadd_argumentstrr   r   set_defaults
parse_argsargsexit_if_emptyr   runr   printjoin)selfnameargvparserr    result r*   @/usr/local/lib/python3.9/dist-packages/pythainlp/cli/tokenize.py__init__   sZ    


zSubAppBase.__init__N__name__
__module____qualname__r,   r*   r*   r*   r+   r      s   r   c                       s   e Zd Z fddZ  ZS )WordTokenizationAppc                    s.   d| _ t| _t| _t| _t j|i | d S NT)	r   r   r   DEFAULT_WORD_TOKEN_SEPARATORr   r   r"   superr,   r%   r    kwargs	__class__r*   r+   r,   N   s
    zWordTokenizationApp.__init__r.   r/   r0   r,   __classcell__r*   r*   r7   r+   r1   M   s   r1   c                       s   e Zd Z fddZ  ZS )SentenceTokenizationAppc                    s.   d| _ t| _t| _t| _t j|i | d S r2   )	r   r   r   DEFAULT_SENT_TOKEN_SEPARATORr   r   r"   r4   r,   r5   r7   r*   r+   r,   W   s
    z SentenceTokenizationApp.__init__r9   r*   r*   r7   r+   r;   V   s   r;   c                       s   e Zd Z fddZ  ZS )SubwordTokenizationAppc                    s.   d| _ t| _t| _t| _t j|i | d S r2   )	r   r   r   DEFAULT_SUBWORD_TOKEN_SEPARATORr   r   r"   r4   r,   r5   r7   r*   r+   r,   `   s
    zSubwordTokenizationApp.__init__r9   r*   r*   r7   r+   r=   _   s   r=   c                   @   s   e Zd Zdd ZdS )Appc                 C   s   t jdddd}|jdtdd ||dd	 }t|j| t|j}|d	d  }|	d
rnt
d| n:|	drtd| n$|	drtd| ntd|  d S )Ntokenizez'Break a text into small units (tokens).u  thainlp tokenize <token_type> [options] "<text>"

token_type:

subword            subword (may not be a linguistic unit)
syllable           syllable
word               word
sent               sentence

options:

--sep or -s <separator>    specify custom separator
                           (default is a space)
--algo or -a <algorithm>   tokenization algorithm
                           (see API doc for more info)
--keep-whitespace or -w    keep whitespaces in output
                           (default)

<separator> and <text> should be inside double quotes.

Example:

thainlp tokenize word -s "|" "ใต้แสงนีออนเปลี่ยวเหงา"

--)progdescriptionusage
token_typez[subword|word|sent])r   r         wwordZsuZsubwordsesentzToken type not available: )r   r   r   r   r   r   r!   rD   lower
startswithr1   r=   r;   r#   )r%   r'   r(   r    rD   r*   r*   r+   r,   i   s*    


zApp.__init__Nr-   r*   r*   r*   r+   r?   h   s   r?   )__doc__r   Z	pythainlpr   Zpythainlp.tokenizer   r   r   r   r   r   r<   r>   Z DEFAULT_SYLLABLE_TOKEN_SEPARATORr3   r   r1   r;   r=   r?   r*   r*   r*   r+   <module>   s    	3			