a
     g#                     @   s   d Z ddlmZmZmZmZmZ ddlmZ ddl	Z
ddlmZ ddlmZ ddlmZ G dd	 d	Zeeeef eeee ee d
ddZe
je
jee eeeeef  dddZdS )z
Minimal re-implementation of KeyBERT.

KeyBERT is a minimal and easy-to-use keyword extraction technique
that leverages BERT embeddings to create keywords and keyphrases
that are most similar to a document.

https://github.com/MaartenGr/KeyBERT
    )ListOptionalIterableTupleUnion)CounterN)pipeline)thai_stopwords)word_tokenizec                   @   s~   e Zd ZdedddZdeeeef eeeeee  e	e
e e
eeef  f dddZe	ee
e f ejdddZd
S )KeyBERT-airesearch/wangchanberta-base-att-spm-uncased)
model_namec                 C   s   t d||dd| _d S )Nzfeature-extractionmain)	tokenizermodelrevision)r   ft_pipeline)selfr    r   E/usr/local/lib/python3.9/dist-packages/pythainlp/summarize/keybert.py__init__   s    zKeyBERT.__init__         r   newmmFN)textkeyphrase_ngram_rangemax_keywordsmin_dfr   
stop_wordsreturnc                 C   s   z|  }W n& ty2   tdt| dY n0 |s<g S |rD|nt }t|||||}	| |}
| |	}t|
||	|}|r|S dd |D S dS )uK  
        Extract Thai keywords and/or keyphrases with KeyBERT algorithm.
        See https://github.com/MaartenGr/KeyBERT.

        :param str text: text to be summarized
        :param Tuple[int, int] keyphrase_ngram_range: Number of token units to be defined as keyword.
                                The token unit varies w.r.t. `tokenizer_engine`.
                                For instance, (1, 1) means each token (unigram) can be a keyword (e.g. "เสา", "ไฟฟ้า"),
                                (1, 2) means one and two consecutive tokens (unigram and bigram) can be keywords
                                (e.g. "เสา", "ไฟฟ้า", "เสาไฟฟ้า")  (default: (1, 2))
        :param int max_keywords: Number of maximum keywords to be returned. (default: 5)
        :param int min_df: Minimum frequency required to be a keyword. (default: 1)
        :param str tokenizer: Name of tokenizer engine to use.
                                Refer to options in :func: `pythainlp.tokenize.word_tokenizer() (default: 'newmm')
        :param bool return_similarity: If `True`, return keyword scores. (default: False)
        :param Optional[Iterable[str]] stop_words: A list of stop words (a.k.a words to be ignored).
                                If not specified, :func:`pythainlp.corpus.thai_stopwords` is used. (default: None)

        :return: list of keywords with score

        :Example:
        ::

            from pythainlp.summarize.keybert import KeyBERT

            text = '''
                อาหาร หมายถึง ของแข็งหรือของเหลว
                ที่กินหรือดื่มเข้าสู่ร่างกายแล้ว
                จะทำให้เกิดพลังงานและความร้อนแก่ร่างกาย
                ทำให้ร่างกายเจริญเติบโต
                ซ่อมแซมส่วนที่สึกหรอ ควบคุมการเปลี่ยนแปลงต่างๆ ในร่างกาย
                ช่วยทำให้อวัยวะต่างๆ ทำงานได้อย่างปกติ
                อาหารจะต้องไม่มีพิษและไม่เกิดโทษต่อร่างกาย
            '''

            kb = KeyBERT()

            keywords = kb.extract_keyword(text)

            # output: ['อวัยวะต่างๆ',
            # 'ซ่อมแซมส่วน',
            # 'เจริญเติบโต',
            # 'ควบคุมการเปลี่ยนแปลง',
            # 'มีพิษ']

            keywords = kb.extract_keyword(text, max_keywords=10, return_similarity=True)

            # output: [('อวัยวะต่างๆ', 0.3228477063109462),
            # ('ซ่อมแซมส่วน', 0.31320597838000375),
            # ('เจริญเติบโต', 0.29115434699705506),
            # ('ควบคุมการเปลี่ยนแปลง', 0.2678430841321016),
            # ('มีพิษ', 0.24996827960821494),
            # ('ทำให้ร่างกาย', 0.23876962942443258),
            # ('ร่างกายเจริญเติบโต', 0.23191285218852364),
            # ('จะทำให้เกิด', 0.22425422716846247),
            # ('มีพิษและ', 0.22162962875299588),
            # ('เกิดโทษ', 0.20773497763458507)]

        zUnable to process data of type z&. Please provide input of string type.c                 S   s   g | ]\}}|qS r   r   ).0kw_r   r   r   
<listcomp>       z,KeyBERT.extract_keywords.<locals>.<listcomp>N)stripAttributeErrortyper	   _generate_ngramsembed_rank_keywords)r   r   r   r   r   r   Zreturn_similarityr    Zstop_words_Zkw_candidates
doc_vectorZ
kw_vectorskeywordsr   r   r   extract_keywords"   s(    E



zKeyBERT.extract_keywords)docsr!   c                 C   sL   |  |}t|ts t|dkr4t|jdd}ntdd |D }|S )zn
        Create an embedding of each input in `docs` by averaging vectors from the last hidden layer.
        r   Zaxisc                 S   s"   g | ]}t |d  jd dqS )r   r1   )nparraymean)r"   Zembr   r   r   r%      r&   z!KeyBERT.embed.<locals>.<listcomp>)r   
isinstancestrlenr2   r3   r4   stack)r   r0   ZembsZemb_meanr   r   r   r+      s    
zKeyBERT.embed)r   )r   r   r   r   FN)__name__
__module____qualname__r6   r   r   intr   r   r   r   floatr/   r2   ndarrayr+   r   r   r   r   r      s(          

dr   )docr   r   tokenizer_enginer    r!   c                    s   |d dksJ d| d|d |d ks<J d| dt tttf  t t ddd}t| |d	g }|d |d d f}t| D ]h}|dkrd
d D }	n"tfddt|D  }
||
}	t|	} fdd| D }	||	 q|S )Nr   r   z9`keyphrase_ngram_range` must start from 1. current value=.z^The value first argument of `keyphrase_ngram_range` must not exceed the second. current value=)ngramsr!   c                 S   s2   g }| D ]$}d |}| |kr|| q|S )N )joinr'   append)rB   Zngrams_joinedZngZjoinedr   r   r   _join_ngram   s    
z%_generate_ngrams.<locals>._join_ngram)Zenginec                 S   s   g | ]}|  r|qS r   )r'   )r"   wordr   r   r   r%      r&   z$_generate_ngrams.<locals>.<listcomp>c                    s   g | ]} |d  qS )Nr   )r"   i)wordsr   r   r%      r&   c                    s$   g | ]\}}| kr|vr|qS r   r   )r"   rG   freq)r   r    r   r   r%      s   )	r   r   r6   r
   rangezipr   itemsextend)r?   r   r   r@   r    rF   Z	all_gramsZngram_rangenrB   Zngrams_tupleZ
ngrams_cntr   )r   r    rI   r   r*      s2     	r*   )r-   word_vectorsr.   r   r!   c                    sr   t jt jddd}t jt jt jddd}|| } ||}|| | t   } fdd|d | D }|S )	N)vr!   c                 S   s\   | j d }t| tjj| ddddj|dd}ttjj|ddd sXJ d|S )Nr   r1   z)Cannot normalize a vector to unit vector.)	shaper2   divideZlinalgZnormZreshaperepeatiscloseall)rQ   Zvec_sizeresultr   r   r   l2_norm   s    
 
z_rank_keywords.<locals>.l2_norm)abr!   c                 S   s   t | |jjjddS )Nr   r1   )r2   matmulTsum)rZ   r[   r   r   r   
cosine_sim   s    z"_rank_keywords.<locals>.cosine_simc                    s   g | ]}|  | fqS r   r   )r"   rZcosine_simsr.   r   r   r%      s   z"_rank_keywords.<locals>.<listcomp>)r2   r>   Zargsort)r-   rP   r.   r   rY   r_   Zranking_descZfinal_ranksr   ra   r   r,      s    

r,   )__doc__typingr   r   r   r   r   collectionsr   numpyr2   Ztransformersr   Zpythainlp.corpusr	   Zpythainlp.tokenizer
   r   r6   r<   r*   r>   r=   r,   r   r   r   r   <module>   s*   	 
1