U
    ڲgDN                     @  sb  d dl mZ d dlmZ d dlmZ ddlmZmZm	Z	 ddl
mZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$G dd deZ%eddddddd d!Z&ed"dd+d%d&dd&d'd(d)Z'd*S ),    )annotations)	lru_cache)	getLogger   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuated	is_arabicis_arabic_isolated_formis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @  sP   e Zd ZdZdddddZddddd	Zdd
ddZedd
ddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    strbool	characterreturnc                 C  s   t dS )z@
        Determine if given character should be fed in.
        NNotImplementedErrorselfr    r$   9/tmp/pip-unpacked-wheel-y7coycdt/charset_normalizer/md.pyeligible&   s    zMessDetectorPlugin.eligibleNonec                 C  s   t dS )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        Nr    r"   r$   r$   r%   feed,   s    zMessDetectorPlugin.feedr   c                 C  s   t dS )zB
        Permit to reset the plugin to the initial state.
        Nr    r#   r$   r$   r%   reset3   s    zMessDetectorPlugin.resetfloatc                 C  s   t dS )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        Nr    r*   r$   r$   r%   ratio9   s    zMessDetectorPlugin.ratioN)	__name__
__module____qualname____doc__r&   r(   r+   propertyr-   r$   r$   r$   r%   r       s   r   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS ) TooManySymbolOrPunctuationPluginr'   r)   c                 C  s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr*   r$   r$   r%   __init__C   s
    z)TooManySymbolOrPunctuationPlugin.__init__r   r   r   c                 C  s   |  S Nisprintabler"   r$   r$   r%   r&   K   s    z)TooManySymbolOrPunctuationPlugin.eligiblec                 C  sp   |  j d7  _ || jkrf|tkrft|r8|  jd7  _n.| dkrft|rft|dkrf|  jd7  _|| _d S )Nr   F   )	r6   r7   r   r   r4   isdigitr   r   r5   r"   r$   r$   r%   r(   N   s    

z%TooManySymbolOrPunctuationPlugin.feedc                 C  s   d| _ d| _d| _d S Nr   )r4   r6   r5   r*   r$   r$   r%   r+   `   s    z&TooManySymbolOrPunctuationPlugin.resetr,   c                 C  s0   | j dkrdS | j| j | j  }|dkr,|S dS )Nr           333333?)r6   r4   r5   )r#   Zratio_of_punctuationr$   r$   r%   r-   e   s    

z&TooManySymbolOrPunctuationPlugin.ratioN	r.   r/   r0   r8   r&   r(   r+   r2   r-   r$   r$   r$   r%   r3   B   s   r3   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS )TooManyAccentuatedPluginr'   r)   c                 C  s   d| _ d| _d S r>   r6   _accentuated_countr*   r$   r$   r%   r8   r   s    z!TooManyAccentuatedPlugin.__init__r   r   r   c                 C  s   |  S r9   )isalphar"   r$   r$   r%   r&   v   s    z!TooManyAccentuatedPlugin.eligiblec                 C  s(   |  j d7  _ t|r$|  jd7  _d S Nr   )r6   r	   rD   r"   r$   r$   r%   r(   y   s    zTooManyAccentuatedPlugin.feedc                 C  s   d| _ d| _d S r>   rC   r*   r$   r$   r%   r+      s    zTooManyAccentuatedPlugin.resetr,   c                 C  s*   | j dk rdS | j| j  }|dkr&|S dS )N   r?   gffffff?rC   )r#   Zratio_of_accentuationr$   r$   r%   r-      s    
zTooManyAccentuatedPlugin.ratioNrA   r$   r$   r$   r%   rB   q   s   rB   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS )UnprintablePluginr'   r)   c                 C  s   d| _ d| _d S r>   )_unprintable_countr6   r*   r$   r$   r%   r8      s    zUnprintablePlugin.__init__r   r   r   c                 C  s   dS NTr$   r"   r$   r$   r%   r&      s    zUnprintablePlugin.eligiblec                 C  s(   t |r|  jd7  _|  jd7  _d S rF   )r   rI   r6   r"   r$   r$   r%   r(      s    zUnprintablePlugin.feedc                 C  s
   d| _ d S r>   )rI   r*   r$   r$   r%   r+      s    zUnprintablePlugin.resetr,   c                 C  s   | j dkrdS | jd | j  S )Nr   r?   rG   )r6   rI   r*   r$   r$   r%   r-      s    
zUnprintablePlugin.ratioNrA   r$   r$   r$   r%   rH      s   rH   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS )SuspiciousDuplicateAccentPluginr'   r)   c                 C  s   d| _ d| _d | _d S r>   _successive_countr6   _last_latin_characterr*   r$   r$   r%   r8      s    z(SuspiciousDuplicateAccentPlugin.__init__r   r   r   c                 C  s   |  ot|S r9   )rE   r   r"   r$   r$   r%   r&      s    z(SuspiciousDuplicateAccentPlugin.eligiblec                 C  st   |  j d7  _ | jd k	rjt|rjt| jrj| rJ| j rJ|  jd7  _t|t| jkrj|  jd7  _|| _d S rF   )r6   rN   r	   isupperrM   r   r"   r$   r$   r%   r(      s    z$SuspiciousDuplicateAccentPlugin.feedc                 C  s   d| _ d| _d | _d S r>   rL   r*   r$   r$   r%   r+      s    z%SuspiciousDuplicateAccentPlugin.resetr,   c                 C  s   | j dkrdS | jd | j  S )Nr   r?   r<   )r6   rM   r*   r$   r$   r%   r-      s    
z%SuspiciousDuplicateAccentPlugin.ratioNrA   r$   r$   r$   r%   rK      s   rK   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS )SuspiciousRanger'   r)   c                 C  s   d| _ d| _d | _d S r>   )"_suspicious_successive_range_countr6   _last_printable_seenr*   r$   r$   r%   r8      s    zSuspiciousRange.__init__r   r   r   c                 C  s   |  S r9   r:   r"   r$   r$   r%   r&      s    zSuspiciousRange.eligiblec                 C  sx   |  j d7  _ | s&t|s&|tkr0d | _d S | jd krD|| _d S t| j}t|}t||rn|  jd7  _|| _d S rF   )r6   isspacer   r   rR   r    is_suspiciously_successive_rangerQ   )r#   r   unicode_range_aunicode_range_br$   r$   r%   r(      s"    


zSuspiciousRange.feedc                 C  s   d| _ d| _d | _d S r>   )r6   rQ   rR   r*   r$   r$   r%   r+      s    zSuspiciousRange.resetr,   c                 C  s"   | j dkrdS | jd | j  }|S )N   r?   r<   )r6   rQ   )r#   Zratio_of_suspicious_range_usager$   r$   r%   r-      s    
zSuspiciousRange.ratioNrA   r$   r$   r$   r%   rP      s   rP   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS )SuperWeirdWordPluginr'   r)   c                 C  s@   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d| _	d S )Nr   F )
_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr6   _bad_character_count_buffer_buffer_accent_count_buffer_glyph_countr*   r$   r$   r%   r8      s    zSuperWeirdWordPlugin.__init__r   r   r   c                 C  s   dS rJ   r$   r"   r$   r$   r%   r&   
  s    zSuperWeirdWordPlugin.eligiblec                 C  s  |  r|  j|7  _t|r,|  jd7  _| jdkrt|dksJt|rt|dkrt|dkrt|dkrt	|dkrt
|dkrd| _t|st|st|st	|st
|r|  jd7  _d S | jsd S | st|st|r\| jr\|  jd7  _t| j}|  j|7  _|dkr| j| dkr:d| _npt| jd r| jd  rtdd | jD dkr|  jd7  _d| _n | jdkrd| _|  jd7  _|d	kr| jrd
d t| jtd|D }d}|rt|| dkrd}|s|  jd7  _d| _| jrB|  jd7  _|  jt| j7  _d| _d| _d| _d| _d| _n6|dkr| dkrt|rd| _|  j|7  _d S )Nr   FT   g      ?c                 s  s   | ]}|  V  qd S r9   rO   ).0_r$   r$   r%   	<genexpr>7  s     z,SuperWeirdWordPlugin.feed.<locals>.<genexpr>   c                 S  s   g | ]\}}|  r|qS r$   re   )rf   cir$   r$   r%   
<listcomp>?  s   z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>r   r@   rY   >   ><~-|rg   =)rE   r`   r	   ra   r^   r   r   r   r   r   r   rb   rS   r   r   rZ   lenr6   r]   rO   allr\   zipranger[   r_   r=   r   )r#   r   Zbuffer_lengthZcamel_case_dstZprobable_camel_casedr$   r$   r%   r(     s    





	


zSuperWeirdWordPlugin.feedc                 C  s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )NrY   Fr   )r`   r]   r^   r[   rZ   r6   r_   r\   r*   r$   r$   r%   r+   ^  s    zSuperWeirdWordPlugin.resetr,   c                 C  s$   | j dkr| jdkrdS | j| j S )N
   r   r?   )rZ   r\   r_   r6   r*   r$   r$   r%   r-   h  s    zSuperWeirdWordPlugin.ratioNrA   r$   r$   r$   r%   rX      s   Q
rX   c                   @  s^   e Zd ZdZddddZdddd	d
ZdddddZddddZeddddZ	dS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r'   r)   c                 C  s   d| _ d| _d S r>   _wrong_stop_count_cjk_character_countr*   r$   r$   r%   r8   v  s    zCjkInvalidStopPlugin.__init__r   r   r   c                 C  s   dS rJ   r$   r"   r$   r$   r%   r&   z  s    zCjkInvalidStopPlugin.eligiblec                 C  s4   |dkr|  j d7  _ d S t|r0|  jd7  _d S )N>      丄   丅r   )rz   r   r{   r"   r$   r$   r%   r(   }  s
    zCjkInvalidStopPlugin.feedc                 C  s   d| _ d| _d S r>   ry   r*   r$   r$   r%   r+     s    zCjkInvalidStopPlugin.resetr,   c                 C  s   | j dk rdS | j| j  S )N   r?   )r{   rz   r*   r$   r$   r%   r-     s    
zCjkInvalidStopPlugin.ratioN)
r.   r/   r0   r1   r8   r&   r(   r+   r2   r-   r$   r$   r$   r%   rx   p  s   rx   c                   @  sZ   e Zd ZddddZddddd	Zdddd
dZddddZeddddZdS )ArchaicUpperLowerPluginr'   r)   c                 C  s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr6   _last_alpha_seen_current_ascii_onlyr*   r$   r$   r%   r8     s    z ArchaicUpperLowerPlugin.__init__r   r   r   c                 C  s   dS rJ   r$   r"   r$   r$   r%   r&     s    z ArchaicUpperLowerPlugin.eligiblec                 C  s$  |  ot|}|dk}|r| jdkr| jdkrV| dkrV| jdkrV|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdkr|
 dkrd| _| jd k	r| r| j s| r| j r| jdkr|  jd7  _d| _qd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr<   )rE   r   r   r=   r   r   r   r   r   r6   isasciirO   islower)r#   r   Zis_concernedZ	chunk_sepr$   r$   r%   r(     sF    


zArchaicUpperLowerPlugin.feedc                 C  s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r6   r   r   r   r   r   r   r*   r$   r$   r%   r+     s    zArchaicUpperLowerPlugin.resetr,   c                 C  s   | j dkrdS | j| j  S )Nr   r?   )r6   r   r*   r$   r$   r%   r-     s    
zArchaicUpperLowerPlugin.ratioNrA   r$   r$   r$   r%   r     s   *	r   c                   @  sZ   e Zd ZddddZddddZddd	d
dZddd	ddZeddddZdS )ArabicIsolatedFormPluginr'   r)   c                 C  s   d| _ d| _d S r>   r6   _isolated_form_countr*   r$   r$   r%   r8     s    z!ArabicIsolatedFormPlugin.__init__c                 C  s   d| _ d| _d S r>   r   r*   r$   r$   r%   r+     s    zArabicIsolatedFormPlugin.resetr   r   r   c                 C  s   t |S r9   )r
   r"   r$   r$   r%   r&     s    z!ArabicIsolatedFormPlugin.eligiblec                 C  s(   |  j d7  _ t|r$|  jd7  _d S rF   )r6   r   r   r"   r$   r$   r%   r(     s    zArabicIsolatedFormPlugin.feedr,   c                 C  s   | j dk rdS | j| j  }|S )NrG   r?   r   )r#   Zisolated_form_usager$   r$   r%   r-     s    
zArabicIsolatedFormPlugin.ratioN)	r.   r/   r0   r8   r+   r&   r(   r2   r-   r$   r$   r$   r%   r     s   r      )maxsizez
str | Noner   )rU   rV   r   c                 C  s  | dks|dkrdS | |kr dS d| kr4d|kr4dS d| ksDd|krHdS d| ksXd|krld| kshd|krldS |  d| d }}|D ]}|tkrq||kr dS q| dk|dk }}|s|rd	| ksd	|krdS |r|rdS d
| ksd
|kr d	| ksd	|krdS | dks|dkr dS d	| ksHd	|ksH| dkr|dkrd| ks\d|kr`dS d| kstd|krxdS | dks|dkrdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKZHangulzBasic Latin)r   r   ZPunctuationZForms)splitr   )rU   rV   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr$   r$   r%   rT     sh    rT   i   皙?Fr   r,   )decoded_sequencemaximum_thresholddebugr   c              	   C  sX  dd t  D }t| d }d}|dk r0d}n|dkr>d}nd	}t| d
 t|D ]d\}}|D ]}	|	|r`|	| q`|dkr|| dks||d krTtdd |D }||krT qqT|rNtd}
|
	t
d| d| d|  t| dkr(|
	t
d| dd   |
	t
d| dd   |D ] }|
	t
|j d|j  q,t|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S  s   g | ]
}| qS r$   r$   )rf   Zmd_classr$   r$   r%   rl   I  s    zmess_ratio.<locals>.<listcomp>r   r?   i       r   r      
r   c                 s  s   | ]}|j V  qd S r9   )r-   )rf   dtr$   r$   r%   rh   `  s     zmess_ratio.<locals>.<genexpr>Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r~   zStarting with: NzEnding with: iz:    )r   __subclasses__rs   ru   rv   r&   r(   sumr   logr   	__class__r-   round)r   r   r   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r$   r$   r%   
mess_ratioA  sF    


r   N)r   F)(
__future__r   	functoolsr   loggingr   Zconstantr   r   r   utilsr	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r3   rB   rH   rK   rP   rX   rx   r   r   rT   r   r$   r$   r$   r%   <module>   s(   L"/%1vLI   