a
     g 	                     @   sB   d Z ddlmZmZ ddlmZ G dd dZG dd deZdS )	z
Tokenzier classes for ULMFiT
    )
CollectionList)THAI2FIT_TOKENIZERc                   @   sD   e Zd ZdZedddZeee dddZee dd	d
Z	dS )BaseTokenizerz;Basic class for a tokenizer function. (codes from `fastai`)langc                 C   s
   || _ d S Nr   selfr    r   D/usr/local/lib/python3.9/dist-packages/pythainlp/ulmfit/tokenizer.py__init__   s    zBaseTokenizer.__init__)treturnc                 C   s
   | dS )N )split)r
   r   r   r   r   	tokenizer   s    zBaseTokenizer.tokenizer)toksc                 C   s   d S r   r   r
   r   r   r   r   add_special_cases   s    zBaseTokenizer.add_special_casesN)
__name__
__module____qualname____doc__strr   r   r   r   r   r   r   r   r   r      s   r   c                   @   s@   e Zd ZdZdedddZeeee dddZd	d
 Z	dS )ThaiTokenizerz
    Wrapper around a frozen newmm tokenizer to make it a
    :class:`fastai.BaseTokenizer`.
    (see: https://docs.fast.ai/text.transform#BaseTokenizer)
    thr   c                 C   s
   || _ d S r   r   r	   r   r   r   r       s    zThaiTokenizer.__init__)textr   c                 C   s
   t | S )u  
        This function tokenizes text using *newmm* engine and the dictionary
        specifically for `ulmfit` related functions
        (see: `Dictionary file (.txt)         <https://github.com/PyThaiNLP/pythainlp/blob/dev/pythainlp/corpus/words_th_thai2fit_201810.txt>`_).
        :meth: tokenize text using a frozen newmm engine
        :param str text: text to tokenize
        :return: tokenized text
        :rtype: list[str]

        :Example:

            Using :func:`pythainlp.ulmfit.ThaiTokenizer.tokenizer` is
            similar to :func:`pythainlp.tokenize.word_tokenize`
            using *ulmfit* engine.

            >>> from pythainlp.ulmfit import ThaiTokenizer
            >>> from pythainlp.tokenize import word_tokenize
            >>>
            >>> text = "อาภรณ์, จินตมยปัญญา ภาวนามยปัญญา"
            >>> ThaiTokenizer.tokenizer(text)
             ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา',
             ' ', 'ภาวนามยปัญญา']
            >>>
            >>> word_tokenize(text, engine='ulmfit')
            ['อาภรณ์', ',', ' ', 'จิน', 'ตม', 'ย', 'ปัญญา',
             ' ', 'ภาวนามยปัญญา']

        )r   Zword_tokenize)r   r   r   r   r   #   s    zThaiTokenizer.tokenizerc                 C   s   d S r   r   r   r   r   r   r   D   s    zThaiTokenizer.add_special_casesN)r   )
r   r   r   r   r   r   staticmethodr   r   r   r   r   r   r   r      s
    r   N)r   typingr   r   Zpythainlp.tokenizer   r   r   r   r   r   r   <module>   s   