
    iZ                       U d Z ddlmZ ddlmZ ddlmZ ddlmZm	Z	m
Z
mZ ddlmZmZmZmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z*m+Z+m,Z,  eded      Z- eddd      Z. eddd      Z/dZ0 e1h d      Z2de3d<    e1h d      Z4de3d<    e1h d      Z5de3d <    e1h d!      Z6de3d"<   e4e5e6d#Z7d$e3d%<    e1h d&      Z8de3d'<   d8d(Z9d)Z:d*Z;d+Z<d,Z=d-Z>	 	 	 	 	 	 	 	 d9d.Z?	 	 	 	 	 	 	 	 	 	 d:d/Z@	 	 	 	 	 	 d;d0ZA	 	 	 	 	 	 d;d1ZBd2ZCd<d3ZD	 	 	 	 	 	 d;d4ZE	 	 	 	 	 	 d;d5ZFef	 	 	 	 	 	 	 d=d6ZGef	 	 	 	 	 	 	 d=d7ZHy)>u@   Pipeline orchestrator — runs all detection stages in sequence.    )annotations)DEFAULT_MAX_BYTES)EncodingEra)BigramProfilehas_model_variantsinfer_languagescore_best_language)DETERMINISTIC_CONFIDENCE
HIGH_BYTESDetectionResultPipelineContext)detect_ascii)	is_binary)
detect_bom)resolve_confusion_groups)detect_escape_encoding)detect_markup_charset)score_candidates)compute_lead_byte_diversitycompute_multibyte_byte_coveragecompute_structural_score)detect_utf8)detect_utf1632_patterns)filter_by_validity)REGISTRYEncodingInfoget_candidatesNencoding
confidencelanguageutf-8g?windows-1252g333333?>   
iso-8859-1iso-8859-15r#   zfrozenset[str]_COMMON_LATIN_ENCODINGS>.                                                                                                                                             zfrozenset[int]_ISO_8859_10_DISTINGUISHING>   r'   r(   r*   r+   r,   r-   r/   r0   r1   r3      r4   r5   r6   r7   r8      r:   r;   r<   r=   r>   r?   r@   rA      rI         rR      _ISO_8859_14_DISTINGUISHING>   rX      rY   rZ      r[   _WINDOWS_1254_DISTINGUISHING)ziso-8859-10ziso-8859-14zwindows-1254zdict[str, frozenset[int]]_DEMOTION_CANDIDATES>                           r'   r(   r+   r8   _KOI8_T_DISTINGUISHINGc                ^    t         j                  |       yt        fd|D               S )au  Return True if encoding is a demotion candidate with no distinguishing bytes.

    Checks whether any non-ASCII byte in *data* falls in the set of byte
    values that decode differently under the given encoding vs iso-8859-1.
    If none do, the data is equally valid under both encodings and there is
    no byte-level evidence for preferring the candidate encoding.
    Fc              3  2   K   | ]  }|d kD  s	|v   yw   N ).0bdistinguishings     m/Users/bowang/.openclaw/workspace/ChatDev/.venv/lib/python3.12/site-packages/chardet/pipeline/orchestrator.py	<genexpr>z!_should_demote.<locals>.<genexpr>   s     A1D1&s   
	)r`   getany)r   datarq   s     @rr   _should_demoterw      s2     *--h7NAAAAA    g?   gffffff?      c                4   g }|D ]  }|j                   rt        | ||      }||j                  |j                  <   |t        k  r@|j
                  0t        |       t        | j                  dt                    z
  |_        |j
                  t        k  rt        | |||j
                        }||j                  |j                  <   |t        k  r|j
                  t        k\  rt        | ||      }|t        k  r|j!                  |       	 t#        |      S )a  Eliminate CJK multi-byte candidates that lack genuine multi-byte structure.

    Four checks are applied in order to each multi-byte candidate:

    1. **Structural pair ratio** (valid_pairs / lead_bytes) must be
       >= ``_CJK_MIN_MB_RATIO``.  Catches files with many orphan lead bytes.

    2. **Minimum non-ASCII byte count**: the data must contain at least
       ``_CJK_MIN_NON_ASCII`` bytes > 0x7F.  Tiny files with 1-5 high bytes
       can accidentally form perfect pairs and score 1.0 structurally.

    3. **Byte coverage** (non-ASCII bytes in valid multi-byte sequences /
       total non-ASCII bytes) must be >= ``_CJK_MIN_BYTE_COVERAGE``.  Latin
       text has many high bytes that are NOT consumed by multi-byte pairs;
       genuine CJK text has nearly all high bytes accounted for.

    4. **Lead byte diversity**: the number of distinct lead byte values in
       valid pairs must be >= ``_CJK_MIN_LEAD_DIVERSITY``.  Genuine CJK text
       draws from a wide repertoire of lead bytes; European false positives
       cluster in a narrow band (e.g. 0xC0-0xDF for accented Latin).

    Returns the filtered candidate list.  Structural scores are cached in
    ``ctx.mb_scores`` for reuse in Stage 2b.
    N)non_ascii_count)is_multibyter   	mb_scoresname_CJK_MIN_MB_RATIOr}   len	translater   _CJK_MIN_NON_ASCIIr   mb_coverage_CJK_MIN_BYTE_COVERAGE_CJK_DIVERSITY_MIN_NON_ASCIIr   _CJK_MIN_LEAD_DIVERSITYappendtuple)rv   valid_candidatesctxgatedencmb_scorebyte_coveragelead_diversitys           rr   _gate_cjk_candidatesr      s   : !#E/c3?H&.CMM#((#++""*&)$i#dnnT:6V2W&W#""%77;c30C0CM )6COOCHH%55""&BB!<T3!L!$;;S)  * <rx   c           	     4   |D ci c]  }|j                   s|j                  | c}t        fd|D              }t        d |D              }t        t	        | g ||            }g }|D ]  }	|	j
                  r&|j                  j                  |	j
                  d      nd}
|
dk\  rB|j                  t        |	j
                  |	j                  d|
z   z  |	j                               ~|j                  |	        |j                  d d	       |S c c}w )
a  Score structurally-valid CJK candidates using statistical bigrams.

    When multiple CJK encodings score equally high structurally, statistical
    scoring differentiates them (e.g. euc-jp vs big5 for Japanese data).
    Single-byte candidates are also scored and included so that the caller
    can compare CJK vs single-byte confidence.

    Multi-byte candidates with high byte coverage (>= 0.95) receive a
    confidence boost proportional to coverage.  When nearly all non-ASCII
    bytes form valid multi-byte pairs, the structural evidence is strong
    and should increase the candidate's ranking relative to single-byte
    alternatives whose bigram models may score higher on small samples.

    Note: boosted confidence values may exceed 1.0 and are used only for
    relative ranking among candidates.  ``run_pipeline`` clamps all
    confidence values to [0.0, 1.0] before returning to callers.
    c              3  8   K   | ]  \  }}|v s|     y wNrn   )ro   r   _sc
enc_lookups      rr   rs   z/_score_structural_candidates.<locals>.<genexpr>5  s&      *;YT3tz?Q
4*;s   
c              3  :   K   | ]  }|j                   r|  y wr   )r~   )ro   es     rr   rs   z/_score_structural_candidates.<locals>.<genexpr>8  s     J#3a1>>#3s           gffffff?   r   c                    | j                   S r   )r    xs    rr   <lambda>z._score_structural_candidates.<locals>.<lambda>I  s    q||rx   Tkeyreverse)r~   r   r   listr   r   r   rt   r   r   r    r!   sort)rv   structural_scoresr   r   r   valid_mbsingle_byteresultsboostedrcoverager   s              @rr   _score_structural_candidatesr     s	   . &6H%5!&&!)%5HJ *; H J#3JJK#D*CH*C{*CDEG &(G;<::3??&&qzz373tNNZZ ||q8|<ZZ NN1  LL+TL:N- Is
   DDc                t   t        |      dkD  r|d   j                  t        |d   j                  |       rw|d   j                  }|dd D ]`  }|j                  t        v s|D cg c]  }|j                  |k7  s||us| }}|D cg c]  }|j                  |k(  s| }}|g||c S  |S c c}w c c}w )a  Demote niche Latin encodings when no distinguishing bytes are present.

    Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
    on data that contains only bytes shared with common Western Latin
    encodings.  When there is no byte-level evidence for the winning
    encoding, promote the first common Western Latin candidate to the top and
    push the demoted encoding to last.
    r   r   N)r   r   rw   r&   )rv   r   demoted_encodingr   r   othersdemoted_entriess          rr   _demote_niche_latinr   M  s     	GqAJ+71:..5"1:..Azz44&&!!**8H*HQVWZAw   /6"XgGW9W1g"X5F5_55  N #Ys   %B0:B0?B0	B5B5c                   |r|d   j                   dk7  r|S d}t        |      D ]  \  }}|j                   dk(  s|} n ||S t        d | D              r-||   }t        |      D cg c]  \  }}||k7  s| }}}|g|S |S c c}}w )a  Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

    KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
    making statistical discrimination difficult.  However, KOI8-T maps 12
    bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
    box-drawing characters.  If any of these bytes appear, KOI8-T is the
    better match.
    r   zkoi8-rNzkoi8-tc              3  8   K   | ]  }|d kD  s	|t         v   ywrl   )ri   )ro   rp   s     rr   rs   z!_promote_koi8t.<locals>.<genexpr>  s     
A1D1&&s   
)r   	enumerateru   )rv   r   	koi8t_idxir   koi8t_resultr   s          rr   _promote_koi8tr   i  s     gaj))X5I'"1::!I # 

A
AAy) )' 2E 21a9n! 2E&v&&N Fs   (B6Bi   c                    |dk(  r| S 	 | j                  |d      j                  dd      S # t        t        f$ r Y yw xY w)aP  Decode data from encoding and re-encode as UTF-8 for language scoring.

    Returns None if the encoding is unknown. For UTF-8, returns data as-is.
    Uses ``errors="ignore"`` because the data already passed byte-validity
    filtering for the detected encoding; any residual invalid bytes are
    irrelevant for language scoring.
    r"   ignore)errorssurrogatepassN)decodeencodeLookupError	TypeError)rv   r   s     rr   _to_utf8r     sY     7{{8H{5<<O = 
 	
 # s   #- ??c                2   g }d}d}|D ]
  }|j                   |j                  t        |j                        }|?| r=t        |j                        r(|t	        |       }t        | |j                  |      \  }}|R| rPt        d      rEt        | |j                        }|r-||j                  dk7  rt	        |      }t        |d|      \  }}|2|j                  t        |j                  |j                  |             |j                  |        |S )a  Fill in language for results missing it.

    Tier 1: single-language encodings via hardcoded map (instant).
    Tier 2: multi-language encodings via statistical bigram scoring (lazy).
    Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).
    N)profiler"   r   )
r!   r   r   r   r   r	   r   r   r   r    )	rv   r   filledr   utf8_profileresultlang_	utf8_datas	            rr   _fill_languager     s    %'F$(G)-L??"v'B!&//2D|);FOO)L?+D1G-dFOOWU4|);G)D$T6??;	#+v'/I'4Y'?1!7LGAt #!'#)#4#4!% f7 8 Mrx   c                J    t        | |      }t        | |      }t        | |      S )zGApply confusion resolution, niche Latin demotion, and KOI8-T promotion.)r   r   r   )rv   r   s     rr   _postprocess_resultsr     s)    
 'tW5G!$0G$((rx   c                   t               }| d| } | st        gS t        |       }||gS t        |       }||gS t	        |       }|?|j
                  3t        j                  |j
                        }|||j                  z  r|gS t        |       }|t        | |      rt        gS t        |       }	|	|	gS t        |       }
|
|
gS ||gS t        |      }t        | |      }|st         gS t#        | ||      }|st         gS g }|D ]f  }|j$                  s|j&                  j                  |j(                        }|t+        | ||      }|dkD  sJ|j-                  |j(                  |f       h |r?|j/                  d d       |d   \  }}|t0        k\  rt3        | |||      }t5        | |      S t7        t9        | t;        |                  }|st         gS t5        | |      S )zBCore pipeline logic. Returns list of results sorted by confidence.N)	max_bytesr   c                    | d   S )Nr   rn   r   s    rr   r   z$_run_pipeline_core.<locals>.<lambda>0  s    QqTrx   Tr   r   )r   _EMPTY_RESULTr   r   r   r   r   rt   erar   r   _BINARY_RESULTr   r   r   r   _FALLBACK_RESULTr   r~   r   r   r   r   r    _STRUCTURAL_CONFIDENCE_THRESHOLDr   r   r   r   r   )rv   encoding_erar   r   
bom_resultutf1632_resultescape_resultenc_infoutf8_precheckmarkup_resultascii_result
candidatesr   r   r   scorer   
best_scorer   s                      rr   _run_pipeline_corer     s9    
C
D D!J|
 -T2N! +40M ]%;%;%G<< 6 67|hll:!?"  %M 49!E
 *$/M   %L~    -J)$
; !! ,D2BCH !! 24MM%%chh/E}0sC@s{!((#((E):;   >4@)!,:992')93G (g66 #D%0@*ABCG !!g..rx   c           
        t        | ||      }t        | dt         |      }|sd}t        |      |D cg c]H  }|j                  dkD  r5t        |j                  t        |j                  d      |j                        n|J c}S c c}w )a,  Run the full detection pipeline.

    :param data: The raw byte data to analyze.
    :param encoding_era: Filter candidates to a specific era of encodings.
    :param max_bytes: Maximum number of bytes to process.
    :returns: A list of :class:`DetectionResult` sorted by confidence descending.
    Nz/pipeline must always return at least one resultg      ?)	r   r   _LANG_SCORE_MAX_BYTESRuntimeErrorr    r   r   minr!   )rv   r   r   r   msgr   s         rr   run_pipeliner   @  s     !|Y?G T"8#897CG?3 	 A <<# 	

Cc$:AJJG	 	  s   AB)r   strrv   bytesreturnbool)rv   r   r   tuple[EncodingInfo, ...]r   r   r   r   )
rv   r   r   zlist[tuple[str, float]]r   r   r   r   r   list[DetectionResult])rv   r   r   r   r   r   )rv   r   r   r   r   zbytes | None)rv   r   r   r   r   intr   r   )I__doc__
__future__r   chardet._utilsr   chardet.enumsr   chardet.modelsr   r   r   r	   chardet.pipeliner
   r   r   r   chardet.pipeline.asciir   chardet.pipeline.binaryr   chardet.pipeline.bomr   chardet.pipeline.confusionr   chardet.pipeline.escaper   chardet.pipeline.markupr   chardet.pipeline.statisticalr   chardet.pipeline.structuralr   r   r   chardet.pipeline.utf8r   chardet.pipeline.utf1632r   chardet.pipeline.validityr   chardet.registryr   r   r   r   r   r   r   	frozensetr&   __annotations__rU   r\   r_   r`   ri   rw   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rn   rx   rr   <module>r     s   F " , %   0 - + ? : 9 9 
 . < 8 C C 6
  TDQ #t 
 $(   +4+   /8/1/ ^ 1n /8 "/ ^ "T 09(0 n  /.03 /  *3L*  
B$          " 3
3.3 
3 	3l-
-.- /- 
	-
 -`
" 8
" B  $(
(/((V)
)") ) 'i/
i/i/ i/ 	i/^ '
  	rx   