
    i*                    |   U d Z ddlmZ ddlZddlZddlmZ dRdZi d eddh      d	 ed
dh      d
 edh      d edh      d edh      d eddh      d eddh      d edh      d edh      d edh      d edh      d edh      d eh d      d ed d!h      d" ed#h      d$ edh      d% ed&h       ed'h       ed(h       ed)h       ed*h       ed+h       ed,h      d-Z	d.e
d/<   d0d1d0d2d3d4d5d6d7d8d9d8d:Zd;e
d<<   	 	 	 	 dSd=Zd>Zd?e
d@<   dAZd?e
dB<   dTdCZ e       Zd.e
dD<   dUdEZe	j%                         D  ci c]  \  } } e|        edF |D               c}} Zd.e
dG<   dTdHZ e       Zd.e
dI<   dVdJZdWdKZ edLdMh      ZdNe
dO<   dXdPZ	 	 	 	 	 	 	 	 dYdQZyc c}} w )ZaA  Encoding equivalences and legacy name remapping.

This module defines:

1. **Directional supersets** for accuracy evaluation: detecting a superset
   encoding when the expected encoding is a subset is correct (e.g., detecting
   utf-8 when expected is ascii), but not the reverse.

2. **Bidirectional equivalents**: groups of encodings where detecting any
   member when another member was expected is considered correct.  This
   includes UTF-16/UTF-32 endian variants (which encode the same text with
   different byte order) and ISO-2022-JP branch variants (which are
   compatible extensions of the same base encoding).

3. **Preferred superset mapping** for the ``should_rename_legacy`` API option:
   replaces detected ISO/subset encoding names with their Windows/CP superset
   equivalents that modern software actually uses.
    )annotationsN)DetectionDictc                    	 t        j                  |       j                  S # t        $ r3 | j	                         j                  dd      j                  dd      cY S w xY w)zNormalize encoding name for comparison.

    :param name: The encoding name to normalize.
    :returns: The canonical codec name, or a lowered/stripped fallback.
    - _)codecslookupnameLookupErrorlowerreplace)r   s    d/Users/bowang/.openclaw/workspace/ChatDev/.venv/lib/python3.12/site-packages/chardet/equivalences.pynormalize_encoding_namer      sQ    >}}T"''' >zz|##C,44S"==>s   ! 9AAasciizutf-8zwindows-1252tis-620iso-8859-11cp874gb2312gb18030gbkbig5	big5hkscscp950	shift_jiscp932shift_jis_2004zshift-jisx0213zeuc-jpzeuc-jis-2004zeuc-jisx0213euc-krcp949cp037cp1140ziso-2022-jp>   iso2022-jp-2iso2022-jp-extiso2022-jp-2004ziso2022-jp-1r"   r#   ziso2022-jp-3r$   
iso-8859-1
iso-8859-2zwindows-1250zwindows-1251zwindows-1256zwindows-1253zwindows-1255zwindows-1254zwindows-1257)
iso-8859-5
iso-8859-6
iso-8859-7
iso-8859-8
iso-8859-9iso-8859-13dict[str, frozenset[str]]	SUPERSETSzWindows-1252CP949zWindows-1250zWindows-1251zWindows-1256zWindows-1253zWindows-1255zWindows-1254CP874zWindows-1257)r   r   r%   r&   r'   r(   r)   r*   r+   r   r,   r   zdict[str, str]PREFERRED_SUPERSETc                    | j                  d      }t        |t              r't        j                  |j	                         |      | d<   | S )a7  Replace the encoding name with its preferred Windows/CP superset.

    Modifies the ``"encoding"`` value in *result* in-place and returns *result*
    for fluent chaining.

    :param result: A detection result dict containing an ``"encoding"`` key.
    :returns: The same *result* dict, modified in-place.
    encoding)get
isinstancestrr1   r   )resultencs     r   apply_legacy_renamer9   b   s>     **Z
 C#s/33CIIKEzM    ))zutf-16z	utf-16-lez	utf-16-be)zutf-32z	utf-32-lez	utf-32-be)r"   r$   r#   ztuple[tuple[str, ...], ...]BIDIRECTIONAL_GROUPS))skcs)ukrubgbe)msid)nodasvLANGUAGE_EQUIVALENCESc                 N    i } t         D ]  }t        |      }|D ]  }|| |<   	  | S )zBBuild a lookup: ISO code -> frozenset of all equivalent ISO codes.)rG   	frozenset)r7   group	group_setcodes       r   _build_language_equiv_indexrM      s5    (*F&e$	D$F4L  ' Mr:   _LANGUAGE_EQUIVc                L    | |k(  ryt         j                  |       }|duxr ||v S )a  Check whether *detected* is an acceptable language for *expected*.

    Returns ``True`` when *expected* and *detected* are the same ISO 639-1
    code, or belong to the same equivalence group in
    :data:`LANGUAGE_EQUIVALENCES`.

    :param expected: Expected ISO 639-1 language code.
    :param detected: Detected ISO 639-1 language code.
    :returns: ``True`` if the languages are equivalent.
    TN)rN   r4   )expecteddetectedrJ   s      r   is_language_equivalentrR      s4     8)E2U!22r:   c              3  2   K   | ]  }t        |        y wNr   ).0ss     r   	<genexpr>rX      s      /,5q"I   _NORMALIZED_SUPERSETSc                 n    i } t         D ])  }t        d |D              }|D ]  }|| t        |      <    + | S )z1Build the bidirectional equivalence lookup index.c              3  2   K   | ]  }t        |        y wrT   rU   )rV   ns     r   rX   z%_build_bidir_index.<locals>.<genexpr>   s     Eu!215urY   )r;   rI   r   )r7   rJ   normedr   s       r   _build_bidir_indexr_      sB    (*F%EuEED4:F*401  & Mr:   _NORMALIZED_BIDIRc                    | |du S |yt        |       }t        |      }||k(  ry|t        v r|t        |   v ry|t        v xr |t        |   v S )a  Check whether *detected* is an acceptable answer for *expected*.

    Acceptable means:

    1. Exact match (after normalization), OR
    2. Both belong to the same bidirectional byte-order group, OR
    3. *detected* is a known superset of *expected*.

    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if the detection is acceptable.
    NFT)r   r`   rZ   )rP   rQ   norm_expnorm_dets       r   
is_correctrd      s}     4&x0H&x0H 8 $$5Fx5P)P 	)) 	8-h77r:   c                ^    t        j                  d|       }dj                  d |D              S )z4NFKD-normalize *text* and strip all combining marks.NFKDr   c              3  L   K   | ]  }t        j                  |      r|  y wrT   )unicodedata	combining)rV   cs     r   rX   z#_strip_combining.<locals>.<genexpr>   s     Cd+*?*?*B1ds   $$)rh   	normalizejoin)textnfkds     r   _strip_combiningro      s)      .D77CdCCCr:   )   ¤   €)rq   rp   zfrozenset[tuple[str, str]]_EQUIVALENT_SYMBOL_PAIRSc                R    | |k(  ry| |ft         v ryt        |       t        |      k(  S )u   Return True if characters *a* and *b* are functionally equivalent.

    Equivalent means:
    - Same character, OR
    - Same base letter after stripping combining marks, OR
    - An explicitly listed symbol equivalence (e.g. ¤ ↔ €)
    T)rr   ro   )abs     r   _chars_equivalentrv      s4     	Av	1v))A"21"555r:   c                4   ||du S |yt        |      }t        |      }||k(  ry	 | j                  |      }| j                  |      }||k(  ryt	        |      t	        |      k7  ryt        d t        ||d      D              S # t        t        f$ r Y yw xY w)u  Check whether *detected* produces functionally identical text to *expected*.

    Returns ``True`` when:

    1. *detected* is not ``None`` and both encoding names normalize to the same
       codec, OR
    2. Decoding *data* with both encodings yields identical strings, OR
    3. Every differing character pair is functionally equivalent: same base
       letter after stripping combining marks, or an explicitly listed symbol
       equivalence (e.g. ¤ ↔ €).

    Returns ``False`` if *detected* is ``None``, either encoding is unknown,
    or either encoding cannot decode *data*.

    :param data: The raw byte data that was detected.
    :param expected: The expected encoding name, or ``None`` for binary files.
    :param detected: The detected encoding name, or ``None``.
    :returns: ``True`` if decoding with *detected* yields functionally identical
        text to decoding with *expected*.
    NFTc              3  :   K   | ]  \  }}t        ||        y wrT   )rv   )rV   rt   ru   s      r   rX   z*is_equivalent_detection.<locals>.<genexpr>4  s     X3W41a A&3Ws   )strict)r   decodeUnicodeDecodeErrorr   lenallzip)datarP   rQ   rb   rc   text_exptext_dets          r   is_equivalent_detectionr     s    . 4&x0H&x0H8;;x(;;x( 8
8}H%X3xRV3WXXX , s   "B BB)r   r6   returnr6   )r7   r   r   r   )r   r-   )rP   r6   rQ   r6   r   bool)rP   
str | NonerQ   r   r   r   )rm   r6   r   r6   )rt   r6   ru   r6   r   r   )r   bytesrP   r   rQ   r   r   r   )__doc__
__future__r   r	   rh   chardet.pipeliner   r   rI   r.   __annotations__r1   r9   r;   rG   rM   rN   rR   itemsrZ   r_   r`   rd   ro   rr   rv   r   )subset	supersetss   00r   <module>r      s!  & #   *	>*(Y01(y-12( 9gY'( i$	(
 
9i[!( I{G,-( G%567( i!1 23( i()( I~./( i	"( Yz"( 9RS( I~/?@A(  I012!($ )^,-%(& )^,-'(( ^,-^,-^,-^,-^,-n-.3(	$ D        !& N  $5 1 6 2 " .I-J* J3, '__.	4 /	 F#Y /,5/ &  /	4 0  0B/C , C FD 8A8 4 6 .Y
.Y%.Y1;.Y	.Yq4s    F8