U
    ڲg.                     @  s.  d dl mZ d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dlmZ d dlmZ d dlmZ d	d
lmZmZmZmZmZmZ e
eddddddZe
eddddddZe
eddddddZe
eddddddZe
eddddddZe
eddddddZe
eddddddZe
eddddddZe
eddddd d!Z e
eddddd"d#Z!e
eddddd$d%Z"e
eddddd&d'Z#e
eddddd(d)Z$e
eddddd*d+Z%e
eddddd,d-Z&e
eddddd.d/Z'e
e(edddd0d1d2Z)e
eddddd3d4Z*d[d6d7dd8d9d:Z+e
d;dddd<d=d>Z,d6d?d@dAdBZ-dddCdDdEZ.d\ddddGdHdIZ/dddJdKdLdMZ0ddddKdNdOZ1dPej2dQfdd7ddRdSdTdUZ3d]d6ddVd7ddd6dddWdX
dYdZZ4dS )^    )annotationsN)IncrementalDecoder)aliases)	lru_cache)findall)	Generator)MultibyteIncrementalDecoder   )ENCODING_MARKSIANA_SUPPORTED_SIMILARRE_POSSIBLE_ENCODING_INDICATIONUNICODE_RANGES_COMBINEDUNICODE_SECONDARY_RANGE_KEYWORDUTF8_MAXIMAL_ALLOCATION)maxsizestrbool)	characterreturnc                 C  sf   zt | }W n tk
r$   Y dS X d|kpdd|kpdd|kpdd|kpdd|kpdd|kpdd|kpdd	|kS )
NFz
WITH GRAVEz
WITH ACUTEzWITH CEDILLAzWITH DIAERESISzWITH CIRCUMFLEXz
WITH TILDEzWITH MACRONzWITH RING ABOVEunicodedataname
ValueErrorr   description r   </tmp/pip-unpacked-wheel-y7coycdt/charset_normalizer/utils.pyis_accentuated   s&    r   c                 C  s.   t | }|s| S |d}tt|d dS )N r      )r   decompositionsplitchrint)r   Z
decomposedcodesr   r   r   remove_accent,   s
    

r%   z
str | Nonec                 C  s.   t | }t D ]\}}||kr|  S qdS )zK
    Retrieve the Unicode range official name from a single character.
    N)ordr   items)r   Zcharacter_ord
range_nameZ	ord_ranger   r   r   unicode_range7   s
    
r)   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFZLATINr   r   r   r   r   is_latinE   s
    r*   c                 C  s2   t | }d|krdS t| }|d kr*dS d|kS )NPTFZPunctuationr   categoryr)   r   character_categorycharacter_ranger   r   r   is_punctuationN   s    
r1   c                 C  sB   t | }d|ksd|krdS t| }|d kr2dS d|ko@|dkS )NSNTFZFormsZLor,   r.   r   r   r   	is_symbol]   s    
r4   c                 C  s$   t | }|d krdS d|kp"d|kS )NFZ	EmoticonsZPictographs)r)   )r   r0   r   r   r   is_emoticonl   s    r5   c                 C  s.   |   s| dkrdS t| }d|kp,|dkS )N>      ｜><+TZ>   PdPcPo)isspacer   r-   )r   r/   r   r   r   is_separatorv   s    
r?   c                 C  s   |   |  kS N)islowerisupperr   r   r   r   is_case_variable   s    rD   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFCJKr   r   Zcharacter_namer   r   r   is_cjk   s
    rG   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFZHIRAGANAr   rF   r   r   r   is_hiragana   s
    rH   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFZKATAKANAr   rF   r   r   r   is_katakana   s
    rI   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFZHANGULr   rF   r   r   r   	is_hangul   s
    rJ   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFZTHAIr   rF   r   r   r   is_thai   s
    rK   c                 C  s.   zt | }W n tk
r$   Y dS X d|kS )NFARABICr   rF   r   r   r   	is_arabic   s
    rM   c                 C  s6   zt | }W n tk
r$   Y dS X d|ko4d|kS )NFrL   zISOLATED FORMr   rF   r   r   r   is_arabic_isolated_form   s
    rN   )r(   r   c                   s   t  fddtD S )Nc                 3  s   | ]}| kV  qd S r@   r   ).0keywordr(   r   r   	<genexpr>   s     z-is_unicode_range_secondary.<locals>.<genexpr>)anyr   rQ   r   rQ   r   is_unicode_range_secondary   s    rT   c                 C  s(   |   dko&|  dko&| dko&| dkS )NFu   ﻿)r>   isprintablerC   r   r   r   is_unprintable   s    
rW       bytesr#   )sequencesearch_zoner   c                 C  s   t | tstt| }tt| dt|| jddd}t|dkrHdS |D ]N}| 	dd}t
 D ]0\}}||kr|    S ||krh|    S qhqLdS )zW
    Extract using ASCII-only decoder any specified encoding in the first n-bytes.
    Nasciiignoreerrorsr   -_)
isinstancerY   	TypeErrorlenr   r   mindecodelowerreplacer   r'   )rZ   r[   Zseq_lenresultsZspecified_encodingencoding_aliasencoding_ianar   r   r   any_specified_encoding   s"    
rl      )r   r   c                 C  s    | dkpt td|  jtS )zQ
    Verify is a specific encoding is a multi byte one based on it IANA name
    >	   utf_8utf_7	utf_8_sig	utf_32_beutf_16	utf_16_le	utf_16_be	utf_32_leutf_32
encodings.)
issubclass	importlibimport_moduler   r   )r   r   r   r   is_multi_byte_encoding   s    
r{   ztuple[str | None, bytes])rZ   r   c                 C  sJ   t D ]@}t | }t|tr |g}|D ]}| |r$||f    S q$qdS )z9
    Identify and extract SIG/BOM in given sequence.
    )N    )r
   rb   rY   
startswith)rZ   iana_encodingZmarksmarkr   r   r   identify_sig_or_bom  s    

r   )r~   r   c                 C  s   | dkS )N>   rr   rv   r   )r~   r   r   r   should_strip_sig_or_bom"  s    r   T)cp_namestrictr   c                 C  sN   |   dd} t D ]\}}| ||fkr|  S q|rJtd|  d| S )zIReturns the Python normalized encoding name (Not the IANA official name).r`   ra   zUnable to retrieve IANA for '')rg   rh   r   r'   r   )r   r   rj   rk   r   r   r   	iana_name&  s    
r   float)iana_name_aiana_name_br   c           	      C  s   t | st |rdS td|  j}td| j}|dd}|dd}d}tdD ]*}t|g}||||krX|d7 }qX|d S )	Ng        rw   r]   r^   r      r	      )r{   ry   rz   r   rangerY   rf   )	r   r   Z	decoder_aZ	decoder_bZid_aZid_bZcharacter_match_countiZto_be_decodedr   r   r   cp_similarity7  s    



r   c                 C  s   | t ko|t |  kS )z
    Determine if two code page are at least 80% similar. IANA_SUPPORTED_SIMILAR dict was generated using
    the function cp_similarity.
    )r   )r   r   r   r   r   is_cp_similarK  s    
r   Zcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)sNone)r   levelformat_stringr   c                 C  s:   t | }|| t  }|t | || d S r@   )logging	getLoggersetLevelStreamHandlersetFormatter	Formatter
addHandler)r   r   r   loggerhandlerr   r   r   set_logging_handlerV  s
    

r   r   zGenerator[str, None, None])
	sequencesrk   offsets
chunk_sizebom_or_sig_availablestrip_sig_or_bomsig_payloadis_multi_byte_decoderdecoded_payloadr   c	                 c  s*  |r6|dkr6|D ]"}	||	|	|  }
|
s, q4|
V  qn|D ]}	|	| }|t | d krXq:| |	|	|  }|r||dkr||| }|j||rdndd}
|r|	dkrt|d}|r|
d | |krt|	|	d d	D ]H}| || }|r|dkr|| }|j|dd}
|
d | |kr qq|
V  q:d S )
NF   r]   r   r^   r   r      )rd   rf   re   r   )r   rk   r   r   r   r   r   r   r   r   chunkZ	chunk_endZcut_sequenceZchunk_partial_size_chkjr   r   r   cut_sequence_chunksc  s>    


r   )rX   )T)N)5
__future__r   ry   r   r   codecsr   Zencodings.aliasesr   	functoolsr   rer   typingr   Z_multibytecodecr   Zconstantr
   r   r   r   r   r   r   r%   r)   r*   r1   r4   r5   r?   rD   rG   rH   rI   rJ   rK   rM   rN   rd   rT   rW   rl   r{   r   r   r   r   r   INFOr   r   r   r   r   r   <module>   sx    

									
  