o
    	cRG                  
   @   sf  d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$edddee% dee% de&fddZ'ed d	"d)d#e%d$e(d%e&de(fd&d'Z)d(S )*    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @   sP   e Zd ZdZdedefddZdeddfddZdd	d
Ze	de
fddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C      t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r"   B/opt/certbot/lib/python3.10/site-packages/charset_normalizer/md.pyeligible$      zMessDetectorPlugin.eligibleNc                 C   r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r    r"   r"   r#   feed*   s   zMessDetectorPlugin.feedc                 C   r   )zB
        Permit to reset the plugin to the initial state.
        r   r!   r"   r"   r#   reset1   r%   zMessDetectorPlugin.resetc                 C   r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r'   r"   r"   r#   ratio7   s   zMessDetectorPlugin.ratior   N)__name__
__module____qualname____doc__strboolr$   r&   r(   propertyfloatr)   r"   r"   r"   r#   r      s    
r   c                   @   V   e Zd ZdddZdedefddZdeddfdd	Zdd
dZe	de
fddZdS ) TooManySymbolOrPunctuationPluginr   Nc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_char_frenzy_symbol_in_wordr'   r"   r"   r#   __init__A   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C      |  S Nisprintabler    r"   r"   r#   r$   I      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r7   r8   r   r   r5   isdigitr   r   r6   r    r"   r"   r#   r&   L   s   


z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r5   r7   r6   r'   r"   r"   r#   r(   ^      
z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           g333333?)r7   r5   r6   )r!   ratio_of_punctuationr"   r"   r#   r)   c   s   

z&TooManySymbolOrPunctuationPlugin.ratior*   r+   r,   r-   r:   r/   r0   r$   r&   r(   r1   r2   r)   r"   r"   r"   r#   r4   @   s    

r4   c                   @   r3   )TooManyAccentuatedPluginr   Nc                 C      d| _ d| _d S rB   r7   _accentuated_countr'   r"   r"   r#   r:   p      
z!TooManyAccentuatedPlugin.__init__r   c                 C   r;   r<   )isalphar    r"   r"   r#   r$   t   r?   z!TooManyAccentuatedPlugin.eligiblec                 C   s,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r7   r
   rJ   r    r"   r"   r#   r&   w   s   zTooManyAccentuatedPlugin.feedc                 C   rH   rB   rI   r'   r"   r"   r#   r(   }   rK   zTooManyAccentuatedPlugin.resetc                 C   s4   | j dks
| j dk rdS | j| j  }|dkr|S dS )Nr      rD   gffffff?rI   )r!   ratio_of_accentuationr"   r"   r#   r)      s   zTooManyAccentuatedPlugin.ratior*   rF   r"   r"   r"   r#   rG   o   s    

rG   c                   @   r3   )UnprintablePluginr   Nc                 C   rH   rB   )_unprintable_countr7   r'   r"   r"   r#   r:      rK   zUnprintablePlugin.__init__r   c                 C      dS NTr"   r    r"   r"   r#   r$         zUnprintablePlugin.eligiblec                 C   s(   t |r|  jd7  _|  jd7  _d S rM   )r   rQ   r7   r    r"   r"   r#   r&      s   zUnprintablePlugin.feedc                 C   s
   d| _ d S rB   )rQ   r'   r"   r"   r#   r(      s   
zUnprintablePlugin.resetc                 C      | j dkrdS | jd | j  S )Nr   rD   rN   )r7   rQ   r'   r"   r"   r#   r)         
zUnprintablePlugin.ratior*   rF   r"   r"   r"   r#   rP      s    

rP   c                   @   r3   )SuspiciousDuplicateAccentPluginr   Nc                 C      d| _ d| _d | _d S rB   _successive_countr7   _last_latin_characterr'   r"   r"   r#   r:      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r<   )rL   r   r    r"   r"   r#   r$      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rM   )r7   r[   r
   isupperrZ   r   r    r"   r"   r#   r&      s   
z$SuspiciousDuplicateAccentPlugin.feedc                 C   rX   rB   rY   r'   r"   r"   r#   r(      rC   z%SuspiciousDuplicateAccentPlugin.resetc                 C   rU   )Nr   rD   r@   )r7   rZ   r'   r"   r"   r#   r)      rV   z%SuspiciousDuplicateAccentPlugin.ratior*   rF   r"   r"   r"   r#   rW      s    

rW   c                   @   r3   )SuspiciousRanger   Nc                 C   rX   rB   )"_suspicious_successive_range_countr7   _last_printable_seenr'   r"   r"   r#   r:      rC   zSuspiciousRange.__init__r   c                 C   r;   r<   r=   r    r"   r"   r#   r$      r?   zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rM   )r7   isspacer   r   r_   r    is_suspiciously_successive_ranger^   )r!   r   unicode_range_aunicode_range_br"   r"   r#   r&      s"   



zSuspiciousRange.feedc                 C   rX   rB   )r7   r^   r_   r'   r"   r"   r#   r(      rC   zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk rdS |S )Nr   rD   r@   g?)r7   r^   )r!   ratio_of_suspicious_range_usager"   r"   r#   r)      s   
zSuspiciousRange.ratior*   rF   r"   r"   r"   r#   r]      s    

r]   c                   @   r3   )SuperWeirdWordPluginr   Nc                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr7   _bad_character_count_buffer_buffer_accent_countr'   r"   r"   r#   r:      s   
zSuperWeirdWordPlugin.__init__r   c                 C   rR   rS   r"   r    r"   r"   r#   r$   	  rT   zSuperWeirdWordPlugin.eligiblec                 C   s  |  rH|  j|7  _t|r|  jd7  _| jdu rFt|du s%t|rFt|du rFt|du rFt|du rFt	|du rFt
|du rFd| _d S | jsMd S | sYt|sYt|r| jr|  jd7  _t| j}|  j|7  _|dkr| j| dkr}d| _t| jd r| jd  r|  jd7  _d| _|dkr| jr|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d	| _d S |d
vr| du rt|rd| _|  j|7  _d S d S d S d S )Nr   FT   g(\?   rf   r   >   _-<=>|~)rL   rm   r
   rn   rk   r   r   r   r   r   r   r`   r   r   rg   lenr7   rj   r\   ri   rh   rl   rA   r   )r!   r   buffer_lengthr"   r"   r#   r&     sx   





	


zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nrf   Fr   )rm   rj   rk   rh   rg   r7   rl   ri   r'   r"   r"   r#   r(   B  s   
zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   rD   )rg   ri   rl   r7   r'   r"   r"   r#   r)   L  s   zSuperWeirdWordPlugin.ratior*   rF   r"   r"   r"   r#   re      s    

6
re   c                   @   sZ   e Zd ZdZdddZdedefddZdeddfd	d
ZdddZ	e
defddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 C   rH   rB   _wrong_stop_count_cjk_character_countr'   r"   r"   r#   r:   Z  rK   zCjkInvalidStopPlugin.__init__r   c                 C   rR   rS   r"   r    r"   r"   r#   r$   ^  rT   zCjkInvalidStopPlugin.eligiblec                 C   s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N>      丄   丅r   )r~   r   r   r    r"   r"   r#   r&   a  s   zCjkInvalidStopPlugin.feedc                 C   rH   rB   r}   r'   r"   r"   r#   r(   h  rK   zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   rD   )r   r~   r'   r"   r"   r#   r)   l  s   
zCjkInvalidStopPlugin.ratior*   )r+   r,   r-   r.   r:   r/   r0   r$   r&   r(   r1   r2   r)   r"   r"   r"   r#   r|   T  s    

r|   c                   @   r3   )ArchaicUpperLowerPluginr   Nc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr7   _last_alpha_seen_current_ascii_onlyr'   r"   r"   r#   r:   t  s   
z ArchaicUpperLowerPlugin.__init__r   c                 C   rR   rS   r"   r    r"   r"   r#   r$     rT   z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQt
|du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr@   )rL   r   r   rA   r   r   r   r   r   r7   r   r\   islower)r!   r   is_concerned	chunk_sepr"   r"   r#   r&     sF   



zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r7   r   r   r   r   r   r   r'   r"   r"   r#   r(     s   
zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   rD   )r7   r   r'   r"   r"   r#   r)     s   
zArchaicUpperLowerPlugin.ratior*   rF   r"   r"   r"   r#   r   s  s    

*	r      )maxsizerb   rc   r   c                 C   sb  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFLatin	Emoticons	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationForms)splitr	   )rb   rc   keywords_range_akeywords_range_belrange_a_jp_charsrange_b_jp_charsr"   r"   r#   ra     sj   ra   i   皙?Fdecoded_sequencemaximum_thresholddebugc              	   C   sR  dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rtd}
|
	t
d| d| d|  t| dkr|
	t
d| dd   |
	t
d| dd   |D ]}|
	t
|j d|j  qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]}| qS r"   r"   ).0md_classr"   r"   r#   
<listcomp>  s    zmess_ratio.<locals>.<listcomp>r   rD   i       r   r      
r   c                 s   s    | ]}|j V  qd S r<   )r)   )r   dtr"   r"   r#   	<genexpr>%  s    zmess_ratio.<locals>.<genexpr>charset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__ry   zipranger$   r&   sumr   logr   	__class__r)   round)r   r   r   	detectorslengthmean_mess_ratio!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r"   r"   r#   
mess_ratio  sV   




r   N)r   F)*	functoolsr   loggingr   typingr   r   constantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r4   rG   rP   rW   r]   re   r|   r   r/   r0   ra   r2   r   r"   r"   r"   r#   <module>   sB    H"/%4ZLF