a
     g&                     @   s<  d Z ddlZddlZddlmZmZ ddlZdZdZdZ	dZ
dZeed	d
dZeed	ddZeed	ddZeed	ddZeed	ddZee ee dddZeed	ddZeed	ddZee ee dddZee ee dddZeed	dd Zee ee dd!d"Zee ee dd#d$ZdS )%z
Preprocessing for ULMFiT
    N)
CollectionListZxxunkZxxrepZxxwrepZxxendZxxurl)textreturnc                 C   s   d}t |t| S )a  
    Replace URL in `text` with TK_URL

    :param str text: text to replace URL in

    :return: text with URLs replaced
    :rtype: str

    :Example:

        >>> from pythainlp.ulmfit import replace_url
        >>> replace_url("go to github.com")
        go to xxurl
    u  (?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@))))resub_TK_URL)r   ZURL_PATTERN r	   E/usr/local/lib/python3.9/dist-packages/pythainlp/ulmfit/preprocess.pyreplace_url   s    r   c                 C   s   t d}| ddddddddd	d
dddddddddtdddddddd} |dt| S )a2  
    Replace HTML strings in `test`. (codes from `fastai`)

    :param str text: text to replace HTML strings in

    :return: text with HTML strings replaced
    :rtype: str

    :Example:

        >>> from pythainlp.ulmfit import fix_html
        >>> fix_html("Anbsp;amp;nbsp;B @.@ ")
        A & B.
    z  +z#39;'zamp;&z#146;znbsp; z#36;$z\n
zquot;z<br />z\""z<unk>z @.@ .z @-@ -z @,@ ,\z \ )r   compilereplace_TK_UNKr   htmlunescape)r   Zre1r	   r	   r
   fix_html'   s<    
	
r   c                 C   s   t dd| S )z7Remove multiple spaces in `text`. (codes from `fastai`)z {2,}r   r   r   r   r	   r	   r
   rm_useless_spacesJ   s    r   c                 C   s   t dd| S )z<Add spaces around / and # in `text`. 
 (codes from `fastai`)z([/#\n])z \1 r   r   r	   r	   r
   spec_add_spacesO   s    r   c                 C   s   dd }t d}||| S )u  
    Replace repetitions at the character level in `text` after the repeated character.
    This is to prevent cases such as 'น้อยยยยยยยย' becomes 'น้อ xxrep 8 ย'
    ; instead it will retain the word as 'น้อย xxrep 8'

    :param str text: input text to replace character repetitions in

    :return: text with repetitive token **xxrep** and the counter
             after the repeated character

    :rtype: str
    :Example:

        >>> from pythainlp.ulmfit import replace_rep_after
        >>>
        >>> text = "กาาาาาาา"
        >>> replace_rep_after(text)
        'กาxxrep7 '
    c                 S   s&   |   \}}| t t|d  dS )N   r   )groups_TK_REPlen)mcccr	   r	   r
   _replace_repi   s    z'replace_rep_after.<locals>._replace_rep(\S)(\1{3,})r   r   r   r   r'   Zre_repr	   r	   r
   replace_rep_afterT   s    
r+   )toksr   c                 C   sp   d}d}g }| t g D ]L}||kr,|d7 }n2||k|dk@ rT|tt||g7 }d}n
|| |}q|dd S )u%  
    Replace repetitive words after tokenization;
    fastai `replace_wrep` does not work well with Thai.

    :param list[str] toks: list of tokens

    :return: list of tokens where **xxwrep** token and the counter
             is added before repetitive words.
    :rtype: list[str]

    :Example:

        >>> from pythainlp.ulmfit import replace_wrep_post_nonum
        >>>
        >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
        >>> replace_wrep_post(toks)
        ['กา', 'xxwrep', '3', 'น้ำ']

    Nr   r    )_TK_END_TK_WREPstrappendr,   Zprevious_wordZ	rep_countresZcurrent_wordr	   r	   r
   replace_wrep_postr   s    

r3   c                 C   s   t dd| S )z#Remove multiple newlines in `text`.z[\n]{2,}r   r   r   r	   r	   r
   rm_useless_newlines   s    r4   c                 C   s   t dd| }t dd|}t dd|}t dd|}t dd|}t dd|}t dd|}t d	d|}t d
d|}t dd|}t dd|}t dd|}|S )zDRemove all empty brackets and artifacts within brackets from `text`.z\(\) z\{\}z\[\]u   \([^a-zA-Z0-9ก-๙]+\)u   \{[^a-zA-Z0-9ก-๙]+\}u   \[[^a-zA-Z0-9ก-๙]+\]u1   (?<=\()[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])u1   (?<=\{)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])u1   (?<=\[)[^a-zA-Z0-9ก-๙]+(?=[a-zA-Z0-9ก-๙])u1   (?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\))u1   (?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\})u1   (?<=[a-zA-Z0-9ก-๙])[^a-zA-Z0-9ก-๙]+(?=\])r   )r   new_liner	   r	   r
   rm_brackets   s2    r7   c                 C   s>   g }| D ]0}t |t|kr.|t| q|| q|S )zd
    Ungroup Zero Width Joiner (ZVJ) Emojis

    See https://emojipedia.org/emoji-zwj-sequence/
    )emojiZemoji_countr#   extendlistr0   )r,   r2   tokr	   r	   r
   ungroup_emoji   s    r<   c                 C   s   dd | D S )zt
    Lowercase all English words;
    English words in Thai texts don't usually have nuances of capitalization.
    c                 S   s   g | ]}|  qS r	   )lower).0r;   r	   r	   r
   
<listcomp>       z!lowercase_all.<locals>.<listcomp>r	   )r,   r	   r	   r
   lowercase_all   s    rA   c                 C   s   dd }t d}||| S )ul  
    Replace repetitions at the character level in `text` after the repetition.
    This is done to prevent such case as 'น้อยยยยยยยย' becoming 'น้อ xxrep ย';
    instead it will retain the word as 'น้อย xxrep '

    :param str text: input text to replace character repetition

    :return: text with repetitive token **xxrep** after
             character repetition
    :rtype: str

    :Example:

        >>> from pythainlp.ulmfit import replace_rep_nonum
        >>>
        >>> text = "กาาาาาาา"
        >>> replace_rep_nonum(text)
        'กา xxrep '

    c                 S   s   |   \}}| dt dS )Nr   )r!   r"   )r$   r%   _r	   r	   r
   r'      s    z'replace_rep_nonum.<locals>._replace_repr(   r)   r*   r	   r	   r
   replace_rep_nonum   s    
rC   c                 C   sj   d}d}g }| t g D ]F}||kr,|d7 }n,||k|dk@ rN|t|g7 }d}n
|| |}q|dd S )u  
    Replace reptitive words post tokenization;
    fastai `replace_wrep` does not work well with Thai.

    :param list[str] toks: list of tokens

    :return: list of tokens where **xxwrep** token is added in front of
             repetitive words.
    :rtype: list[str]

    :Example:

        >>> from pythainlp.ulmfit import replace_wrep_post_nonum
        >>>
        >>> toks = ["กา", "น้ำ", "น้ำ", "น้ำ", "น้ำ"]
        >>> replace_wrep_post_nonum(toks)
        ['กา', 'xxwrep', 'น้ำ']

    Nr   r    )r-   r.   r0   r1   r	   r	   r
   replace_wrep_post_nonum   s    

rD   c                 C   s(   g }| D ]}|  }|r|| q|S )z
    Do not include space for bag-of-word models.

    :param list[str] toks: list of tokens

    :return: list of tokens where space tokens (" ") are filtered out
    :rtype: list[str]
    )stripr0   )r,   r2   tr	   r	   r
   remove_space  s    	rG   )__doc__r   r   typingr   r   r8   r   r"   r.   r-   r   r/   r   r   r   r   r+   r3   r4   r7   r<   rA   rC   rD   rG   r	   r	   r	   r
   <module>   s,   ##!#