
    i
                        d Z ddlmZ ddlZddlZddlmZmZ dZ ej                  dej                        Z ej                  dej                        Z ej                  dej                        Zdd	Zdd
ZddZy)z2Stage 1b: HTML/XML charset declaration extraction.    )annotationsN)DETERMINISTIC_CONFIDENCEDetectionResulti   s*   <\?xml[^>]+encoding\s*=\s*['"]([^'"]+)['"]s,   <meta[^>]+charset\s*=\s*['"]?\s*([^\s'">;]+)s6   <meta[^>]+content\s*=\s*['"][^'"]*charset=([^\s'">;]+)c                    	 | j                  d      j                         j                         }t        j                  |       |S # t
        t        t        f$ r Y yw xY w)aQ  Validate encoding name via codecs and return the lowercased original name.

    We use ``codecs.lookup()`` to verify the encoding is recognized by Python,
    but return the original (lowercased) name rather than the codec's canonical
    name so that common aliases like ``iso-8859-1`` and ``windows-1252`` are
    preserved as-is.
    asciiN)decodestriplowercodecslookupLookupErrorUnicodeDecodeError
ValueError)nametexts     g/Users/bowang/.openclaw/workspace/ChatDev/.venv/lib/python3.12/site-packages/chardet/pipeline/markup.py_normalize_encodingr      sV    {{7#))+113d  +Z8 s   AA AAc                    | sy| dt          }t        t        t        fD ]S  }|j	                  |      }|st        |j                  d            }|4t        | |      sAt        |t        d      c S  y)a`  Scan the first bytes of *data* for an HTML/XML charset declaration.

    Checks for:

    1. ``<?xml ... encoding="..."?>``
    2. ``<meta charset="...">``
    3. ``<meta http-equiv="Content-Type" content="...; charset=...">``

    :param data: The raw byte data to scan.
    :returns: A :class:`DetectionResult` with confidence 0.95, or ``None``.
    N   )encoding
confidencelanguage)
_SCAN_LIMIT_XML_ENCODING_RE_HTML5_CHARSET_RE_HTML4_CONTENT_TYPE_REsearchr   group_validate_bytesr   r   )dataheadpatternmatchr   s        r   detect_markup_charsetr$   (   sw     D$&79OPt$*5;;q>:H#h(G&%7!  Q     c                `    	 | dt          j                  |       y# t        t        f$ r Y yw xY w)zCheck that *data* can be decoded under *encoding* without errors.

    Only validates the first ``_SCAN_LIMIT`` bytes to avoid decoding a
    full 200 kB input just to verify a charset declaration found in the
    header.
    NFT)r   r   r   r   )r    r   s     r   r   r   G   s:    \k!!(+  , s    --)r   bytesreturnz
str | None)r    r'   r(   zDetectionResult | None)r    r'   r   strr(   bool)__doc__
__future__r   r   rechardet.pipeliner   r   r   compile
IGNORECASEr   r   r   r   r$   r    r%   r   <module>r2      sz    8 "  	 F2::6  BJJ8"--  $BBMM 
">r%   