U
    ڲgYX                     @  sR  d dl mZ d dlZd dlmZ d dlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZmZmZ edZe  Z!e!"e#d d(ddddddddddddddZ$d)ddddddddddddddZ%d*d ddddddddddd!d"d#Z&d+d$ddddddddddd%d&d'Z'dS ),    )annotationsN)PathLike)BinaryIO   )coherence_ratioencoding_languagesmb_encoding_languagesmerge_coherence_ratios)IANA_SUPPORTEDTOO_BIG_SEQUENCETOO_SMALL_SEQUENCETRACE)
mess_ratio)CharsetMatchCharsetMatches)any_specified_encodingcut_sequence_chunks	iana_nameidentify_sig_or_bomis_cp_similaris_multi_byte_encodingshould_strip_sig_or_bomZcharset_normalizerz)%(asctime)s | %(levelname)s | %(message)s      皙?TF皙?zbytes | bytearrayintfloatzlist[str] | Noneboolr   )	sequencessteps
chunk_size	thresholdcp_isolationcp_exclusionpreemptive_behaviourexplainlanguage_thresholdenable_fallbackreturnc
           2      C  s	  t | ttfs tdt| |r>tj}
tt	 t
t t| }|dkrtd |rvtt	 t
|
prtj tt| dddg dgS |dk	rttd	d
| dd |D }ng }|dk	rttdd
| dd |D }ng }||| krttd||| d}|}|dkr:|| |k r:t|| }t| tk }t| tk}|rlttd| n|rttd| g }|rt| nd}|dk	r|| ttd| t }g }g }d}d}d}t }t }t| \}}|dk	r|| ttdt|| |d d|kr4|d |t D ]~}|rV||krVq<|rj||krjq<||krxq<|| d}||k}|ot|}|dkr|sttd| q<|dkr|sttd| q<zt|}W n. t t!fk
r   ttd| Y q<Y nX zr|rf|dkrft"|dkrJ| dtd n| t|td |d n&t"|dkrv| n| t|d |d}W n\ t#t$fk
r } z8t |t$sttd|t"| || W Y q<W 5 d}~X Y nX d} |D ]}!t%||!rd}  qq| r,ttd||! q<t&|s8dnt||t|| }"|oh|dk	oht||k }#|#r~ttd | tt|"d! }$t'|$d"}$d}%d}&g }'g }(zt(| ||"||||||	D ]|})|'|) |(t)|)||dko dt|  kod"kn   |(d# |kr|%d7 }%|%|$ks6|r|dkr q@qW n@ t#k
r } z ttd$|t"| |$}%d}&W 5 d}~X Y nX |&s|r|sz| td%d j*|d&d' W nL t#k
r  } z,ttd(|t"| || W Y q<W 5 d}~X Y nX |(rt+|(t|( nd}*|*|ks0|%|$kr|| ttd)||%t,|*d* d+d, |	r<|dd|fkr<|&s<t| ||dg ||d-}+||kr|+}n|dkr|+}n|+}q<ttd.|t,|*d* d+d, |st-|},nt.|},|,rttd/|t"|, g }-|dkrD|'D ],})t/|)||,r0d0|,nd}.|-|. qt0|-}/|/rfttd1|/| t| ||*||/|dks||ddfkr|nd|d-}0||0 ||ddfkr|*d2k r|*dkrtd3|0j1 |rtt	 t
|
 t|0g  S ||0 t|rx|dks&||krxd|krxd|krx|2 }1td3|1j1 |rjtt	 t
|
 t|1g  S ||kr<td4| |rtt	 t
|
 t|| g  S q<t|dk	rn|s|s|rttd5 |	r
td6|j1 || nd|	r|dk	s>|	r4|	r4|j3|j3k	s>|dk		rTtd7 || n|	rntd8 || |	rtd9|2 j1t|d  n
td: |	rtt	 t
|
 |S );af  
    Given a raw bytes sequence, return the best possibles charset usable to render str objects.
    If there is no results, it is a strong indicator that the source is binary/not text.
    By default, the process will extract 5 blocks of 512o each to assess the mess and coherence of a given sequence.
    And will give up a particular code page after 20% of measured mess. Those criteria are customizable at will.

    The preemptive behavior DOES NOT replace the traditional detection workflow, it prioritize a particular code page
    but never take it for granted. Can improve the performance.

    You may want to focus your attention to some code page or/and not others, use cp_isolation and cp_exclusion for that
    purpose.

    This function will strip the SIG in the payload/sequence every time except on UTF-16, UTF-32.
    By default the library does not setup any handler other than the NullHandler, if you choose to set the 'explain'
    toggle to True it will alter the logger configuration to add a StreamHandler that is suitable for debugging.
    Custom logging format and handler can be set manually.
    z3Expected object of type bytes or bytearray, got: {}r   z<Encoding detection on empty bytes, assuming utf_8 intention.utf_8g        F Nz`cp_isolation is set. use this flag for debugging purpose. limited list of encoding allowed : %s.z, c                 S  s   g | ]}t |d qS Fr   .0cp r1   :/tmp/pip-unpacked-wheel-y7coycdt/charset_normalizer/api.py
<listcomp>[   s     zfrom_bytes.<locals>.<listcomp>zacp_exclusion is set. use this flag for debugging purpose. limited list of encoding excluded : %s.c                 S  s   g | ]}t |d qS r,   r-   r.   r1   r1   r2   r3   f   s     z^override steps (%i) and chunk_size (%i) as content does not fit (%i byte(s) given) parameters.r   z>Trying to detect encoding from a tiny portion of ({}) byte(s).zIUsing lazy str decoding because the payload is quite large, ({}) byte(s).z@Detected declarative mark in sequence. Priority +1 given for %s.zIDetected a SIG or BOM mark on first %i byte(s). Priority +1 given for %s.ascii>   utf_16utf_32z\Encoding %s won't be tested as-is because it require a BOM. Will try some sub-encoder LE/BE.>   utf_7zREncoding %s won't be tested as-is because detection is unreliable without BOM/SIG.z2Encoding %s does not provide an IncrementalDecoderg    A)encodingz9Code page %s does not fit given bytes sequence at ALL. %sTzW%s is deemed too similar to code page %s and was consider unsuited already. Continuing!zpCode page %s is a multi byte encoding table and it appear that at least one character was encoded using n-bytes.      zaLazyStr Loading: After MD chunk decode, code page %s does not fit given bytes sequence at ALL. %sg     j@strict)errorsz^LazyStr Loading: After final lookup, code page %s does not fit given bytes sequence at ALL. %szc%s was excluded because of initial chaos probing. Gave up %i time(s). Computed mean chaos is %f %%.d      )ndigits)Zpreemptive_declarationz=%s passed initial chaos probing. Mean measured chaos is %f %%z&{} should target any language(s) of {},z We detected language {} using {}r   z.Encoding detection: %s is most likely the one.zoEncoding detection: %s is most likely the one as we detected a BOM or SIG within the beginning of the sequence.zONothing got out of the detection process. Using ASCII/UTF-8/Specified fallback.z7Encoding detection: %s will be used as a fallback matchz:Encoding detection: utf_8 will be used as a fallback matchz:Encoding detection: ascii will be used as a fallback matchz]Encoding detection: Found %s as plausible (best-candidate) for content. With %i alternatives.z=Encoding detection: Unable to determine any suitable charset.)4
isinstance	bytearraybytes	TypeErrorformattypeloggerlevel
addHandlerexplain_handlersetLevelr   lendebugremoveHandlerloggingWARNINGr   r   logjoinr   r   r   r   appendsetr   r
   addr   r   ModuleNotFoundErrorImportErrorstrUnicodeDecodeErrorLookupErrorr   rangemaxr   r   decodesumroundr   r   r   r	   r8   bestfingerprint)2r   r    r!   r"   r#   r$   r%   r&   r'   r(   Zprevious_logger_levellengthZis_too_small_sequenceZis_too_large_sequenceZprioritized_encodingsZspecified_encodingZtestedZtested_but_hard_failureZtested_but_soft_failureZfallback_asciiZfallback_u8Zfallback_specifiedresultsZearly_stop_resultsZsig_encodingZsig_payloadZencoding_ianaZdecoded_payloadZbom_or_sig_availableZstrip_sig_or_bomZis_multi_byte_decodereZsimilar_soft_failure_testZencoding_soft_failedZr_Zmulti_byte_bonusZmax_chunk_gave_upZearly_stop_countZlazy_str_hard_failureZ	md_chunksZ	md_ratioschunkZmean_mess_ratioZfallback_entryZtarget_languagesZ	cd_ratiosZchunk_languagesZcd_ratios_mergedZcurrent_matchZprobable_resultr1   r1   r2   
from_bytes!   s(   












	






&



	


 
 










	






rg   r   )fpr    r!   r"   r#   r$   r%   r&   r'   r(   r)   c
           
      C  s   t |  |||||||||	
S )z
    Same thing than the function from_bytes but using a file pointer that is already ready.
    Will not close the file pointer.
    )rg   read)
rh   r    r!   r"   r#   r$   r%   r&   r'   r(   r1   r1   r2   from_fp   s    rj   zstr | bytes | PathLike)pathr    r!   r"   r#   r$   r%   r&   r'   r(   r)   c
                 C  s<   t | d(}
t|
|||||||||	
W  5 Q R  S Q R X dS )z
    Same thing than the function from_bytes but with one extra step. Opening and reading given file path in binary mode.
    Can raise IOError.
    rbN)openrj   )rk   r    r!   r"   r#   r$   r%   r&   r'   r(   rh   r1   r1   r2   	from_path>  s    rn   z!PathLike | str | BinaryIO | bytes)fp_or_path_or_payloadr    r!   r"   r#   r$   r%   r&   r'   r(   r)   c
                 C  sz   t | ttfr,t| |||||||||	d
}
nHt | ttfrXt| |||||||||	d
}
nt| |||||||||	d
}
|
 S )a)  
    Detect if the given input (file, bytes, or path) points to a binary file. aka. not a string.
    Based on the same main heuristic algorithms and default kwargs at the sole exception that fallbacks match
    are disabled to be stricter around ASCII-compatible but unlikely to be a string.
    )	r    r!   r"   r#   r$   r%   r&   r'   r(   )rB   rY   r   rn   rD   rC   rg   rj   )ro   r    r!   r"   r#   r$   r%   r&   r'   r(   Zguessesr1   r1   r2   	is_binary]  sX    rp   )	r   r   r   NNTFr   T)	r   r   r   NNTFr   T)	r   r   r   NNTFr   T)	r   r   r   NNTFr   F)(
__future__r   rP   osr   typingr   Zcdr   r   r   r	   Zconstantr
   r   r   r   Zmdr   modelsr   r   utilsr   r   r   r   r   r   r   	getLoggerrH   StreamHandlerrK   setFormatter	Formatterrg   rj   rn   rp   r1   r1   r1   r2   <module>   sr   $

         $             $          $!         