a
     gD.                  	   @   sH  d Z ddlmZ ddlmZ ddlmZmZmZm	Z	m
Z
mZmZmZmZ ddlmZmZ ddlmZ ddlmZ eedd	d
ZeedddZeeef eeeeegef edddZeee dddZeee dddZeeeef e	e e	eeef  f eeeeeegef  e
eeef  dddZG dd dZdS )z
Spell checker, using Peter Norvig algorithm.
Spelling dictionary can be customized.
Default spelling dictionary is based on Thai National Corpus.

Based on Peter Norvig's Python code from http://norvig.com/spell-correct.html
    )Counter)digits)	CallableDict	ItemsViewIterableListOptionalSetTupleUnion)thai_digitsthai_letters)tnc)
isthaicharwordreturnc                 C   s   dS )NT r   r   r   </usr/local/lib/python3.9/dist-packages/pythainlp/spell/pn.py
_no_filter   s    r   c                 C   s:   | D ]0}|dkrt |s dS |tv s.|tv r dS qdS )N.FT)r   r   r   )r   chr   r   r   _is_thai_and_not_num    s    r   )	word_freqmin_freqmin_lenmax_lendict_filterr   c                 C   sT   | r| d |k rdS | d }|rH|t |  kr8|krHn n|d dksLdS ||S )z
    Checks whether a given word has the required minimum frequency min_freq
    and its character length is between min_len and max_len (inclusive).
       Fr   r   len)r   r   r   r   r   r   r   r   r   _keep)   s    ,r#   c                    sj    fddt t d D }dd |D }dd |D }dd |D }dd |D }t|| | | S )zO
    Returns a set of words with an edit distance of 1 from the input word
    c                    s$   g | ]} d |  |d  fqS Nr   ).0ir   r   r   
<listcomp>B       z_edits1.<locals>.<listcomp>r    c                 S   s$   g | ]\}}|r||d d  qS r    Nr   r%   LRr   r   r   r'   C   r(   c                 S   s<   g | ]4\}}t |d kr||d   |d  |dd  qS )r    r      Nr!   r*   r   r   r   r'   D   r(   c                 S   s2   g | ]*\}}|rt D ]}|| |d d  qqS r)   r   r%   r+   r,   cr   r   r   r'   E   r(   c                 S   s&   g | ]\}}t D ]}|| | qqS r   r.   r/   r   r   r   r'   F   r(   )ranger"   set)r   splitsZdeletesZ
transposesZreplacesinsertsr   r   r   _edits1>   s    r5   c                 C   s   t dd t| D S )zO
    Returns a set of words with an edit distance of 2 from the input word
    c                 s   s    | ]}t |D ]
}|V  qqd S r$   )r5   )r%   e1e2r   r   r   	<genexpr>O   r(   z_edits2.<locals>.<genexpr>)r2   r5   r   r   r   r   _edits2K   s    r9   )custom_dictr   r   r   r   r   c                    sx   t | trt|  } t| }t|}t |trH fdd| D } n,t |trl fdd| D } ntd| S )zE
    Converts a custom dictionary to a list of (str, int) tuples
    c                    s(   g | ] }t |d fd  r|d fqS )r    r#   )r%   r   )r   r   r   r   r   r'   f   s   z(_convert_custom_dict.<locals>.<listcomp>c                    s    g | ]}t | r|qS r   r;   )r%   r   r   r   r   r   r   r   r'   m   s   zVcustom_dict must be either Dict[str, int], Iterable[Tuple[str, int]], or Iterable[str])	
isinstancedictlistitemsiternextstrtuple	TypeError)r:   r   r   r   r   r&   Zfirst_memberr   r<   r   _convert_custom_dictR   s     


rF   c                	   @   s   e Zd Zddddefeeeef ee ee	eef  f eeee
eegef  dddZeeef ddd	Zee ee d
ddZeedddZeedddZeee dddZeedddZdS )NorvigSpellCheckerNr-   (   )r:   r   r   r   r   c                 C   s`   |sdd t  D }|st}t|||||}tt|| _|  jt 7  _t| j | _	dS )a_  
        Initializes Peter Norvig's spell checker object.
        Spelling dictionary can be customized.
        By default, spelling dictionary is from
        `Thai National Corpus <http://www.arts.chula.ac.th/ling/tnc/>`_

        Basically, Norvig's spell checker will choose the most likely
        corrected spelling given a word by searching for candidates of
        corrected words based on edit distance.
        Then, it selects the candidate with
        the highest word occurrence probability.

        :param str custom_dict: A custom spelling dictionary. This can be:
                                (1) a dictionary (`dict`), with words (`str`)
                                    as keys and frequencies (`int`) as values;
                                (2) an iterable (list, tuple, or set) of words
                                    (`str`) and frequency (`int`) tuples:
                                    `(str, int)`; or
                                (3) an iterable of just words (`str`), without
                                    frequencies -- in this case `1` will be
                                    assigned to every words.
                                Default is from Thai National Corpus (around
                                40,000 words).
        :param int min_freq: Minimum frequency of a word to keep (default = 2)
        :param int min_len: Minimum length (in characters) of a word to keep
                            (default = 2)
        :param int max_len: Maximum length (in characters) of a word to keep
                            (default = 40)
        :param func dict_filter: A function to filter the dictionary.
                                 Default filter removes any word
                                 with numbers or non-Thai characters.
                                 If no filter is required, use None.
        c                 S   s   g | ]\}}||fqS r   r   )r%   r&   jr   r   r   r'      r(   z/NorvigSpellChecker.__init__.<locals>.<listcomp>N)
r   Z
word_freqsr   rF   r   r>   _NorvigSpellChecker__WORDSsumvalues _NorvigSpellChecker__WORDS_TOTAL)selfr:   r   r   r   r   r   r   r   __init__|   s    +
zNorvigSpellChecker.__init__)r   c                 C   s
   | j  S )u3  
        Returns the spelling dictionary currently used by this spell checker

        :return: spelling dictionary of this instance
        :rtype: list[tuple[str, int]]

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            dictionary= [("หวาน", 30), ("มะนาว", 2), ("แอบ", 3223)]

            checker = NorvigSpellChecker(custom_dict=dictionary)
            checker.dictionary()
            # output: dict_items([('หวาน', 30), ('มะนาว', 2), ('แอบ', 3223)])
        )rJ   r@   rN   r   r   r   
dictionary   s    zNorvigSpellChecker.dictionary)wordsr   c                    s   t  fdd|D S )u  
        Returns a list of given words found in the spelling dictionary

        :param list[str] words: A list of words to check if they exist
                                in the spelling dictionary

        :return: intersection of the given word list and words
                 in the spelling dictionary
        :rtype: list[str]

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.known(["เพยน", "เพล", "เพลง"])
            # output: ['เพล', 'เพลง']

            checker.known(['ยกไ', 'ไฟล์ม'])
            # output: []

            checker.known([])
            # output: []
        c                 3   s   | ]}| j v r|V  qd S r$   rJ   )r%   wrP   r   r   r8      r(   z+NorvigSpellChecker.known.<locals>.<genexpr>)r?   )rN   rR   r   rP   r   known   s    zNorvigSpellChecker.knownr   c                 C   s   | j | | j S )ui  
        Returns the probability of an input word,
        according to the spelling dictionary

        :param str word: A word to check occurrence probability of

        :return: word occurrence probability
        :rtype: float

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.prob("ครัช")
            # output: 0.0

            checker.prob("รัก")
            # output: 0.0006959172792052158

            checker.prob("น่ารัก")
            # output: 9.482306849763902e-05
        )rJ   rM   rN   r   r   r   r   prob   s    zNorvigSpellChecker.probc                 C   s
   | j | S )u  
        Returns the frequency of an input word,
        according to the spelling dictionary

        :param str word: A word to check frequency of
        :return: frequency of the given word in the spelling dictionary
        :rtype: int

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.freq("ปัญญา")
            # output: 3639

            checker.freq("บิญชา")
            # output: 0
        rS   rV   r   r   r   freq  s    zNorvigSpellChecker.freqc                 C   sL   |s
dgS |  |gp6|  t|p6|  t|p6|g}|j| jdd |S )u  
        Returns a list of all correctly-spelled words whose spelling
        is similar to the given word by edit distance metrics.
        The returned list of words will be sorted by decreasing
        order of word frequencies in the word spelling dictionary.

        First, if the input word is spelled correctly,
        this method returns a list of exactly one word which is itself.
        Next, this method looks for a list of all correctly spelled words
        whose edit distance value is 1 from the input word.
        If there is no such word, then the search expands to
        a list of words whose edit distance value is 2.
        And if that still fails, the list of input words is returned.

        :param str word: A word to check spelling of

        :return: list of possibly correct words within 1 or 2 edit distance
                 and sorted by frequency of word occurrence in the
                 spelling dictionary in descending order.
        :rtype: list[str]

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.spell("เส้นตรบ")
            # output: ['เส้นตรง']

            checker.spell("ครัช")
            # output: ['ครับ', 'ครัว', 'รัช', 'ครัม', 'ครัน',
            # 'วรัช', 'ครัส', 'ปรัช', 'บรัช', 'ครัง',
            #'คัช', 'คลัช', 'ครัย', 'ครัด']
         T)keyreverse)rU   r5   r9   sortrX   )rN   r   
candidatesr   r   r   spell  s    %zNorvigSpellChecker.spellc                 C   sJ   |sdS z d|v rt | nt| |W S  ty:   Y n0 | |d S )u  
        Returns the most possible word, using the probability from
        the spelling dictionary

        :param str word: A word to correct spelling of

        :return: the correct spelling of the given word
        :rtype: str

        :Example:
        ::

            from pythainlp.spell import NorvigSpellChecker

            checker = NorvigSpellChecker()

            checker.correct("ปัญชา")
            # output: 'ปัญหา'

            checker.correct("บิญชา")
            # output: 'บัญชา'

            checker.correct("มิตรภาบ")
            # output: 'มิตรภาพ'
        rY   r   r   )floatint
ValueErrorr^   rV   r   r   r   correctM  s    
zNorvigSpellChecker.correct)__name__
__module____qualname__r   r   r   rC   r`   r   r   r	   r   boolrO   r   rQ   r   rU   r_   rW   rX   r^   rb   r   r   r   r   rG   {   s(    :2rG   N) __doc__collectionsr   stringr   typingr   r   r   r   r   r	   r
   r   r   Z	pythainlpr   r   Zpythainlp.corpusr   Zpythainlp.utilr   rC   rf   r   r   r`   r#   r5   r9   rF   rG   r   r   r   r   <module>   s6   ,

 )