+
    i!              	          R t ^ RIt^ RIHtHtHtHt ^RIHt ]	! 0 RkRkRkRkRkR	kR
kRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkRkR kR!kR"kR#kR$kR%kR&kR'kR(kR)kR*kR+kR,kR-kR.kR/kR0kR1k4      t
R2 R3 ltRXR4 R5 lltR6 R7 lt]]P                  ]P                  ]P                   ]P"                  ]P$                  ]P&                  ]P(                  ]P*                  3,          tR8 R9 ltR: R; ltR< R= ltR> R? ltR@ RA ltRYRB RC lltRYRD RE lltRYRF RG lltRYRH RI lltRYRJ RK llt RYRL RM llt!RYRN RO llt"RYRP RQ llt#RYRR RS llt$RTRU/RV RW llt%R# )Zz.Near-duplicate detection for last30days skill.N)ListSetTupleUnion)schematheaantoforhowisinofonandwithfrombyatthisthatitmyyourimeweyouwhataredocanitsbeornotnosoifbutaboutalljustgethashavewaswillshowhnc                0    V ^8  d   QhR\         R\         /#    textreturn)str)formats   "R/Users/bowang/.openclaw/workspace/skills/last30days-official/scripts/lib/dedupe.py__annotate__r=      s     
 
 
 
    c                    V P                  4       p \        P                  ! RRV 4      p \        P                  ! RRV 4      p V P                  4       # )zWNormalize text for comparison.

- Lowercase
- Remove punctuation
- Collapse whitespace
[^\w\s] z\s+)lowerresubstrip)r8   s   &r<   normalize_textrF      s?     ::<D66*c4(D66&#t$D::<r>   c                R    V ^8  d   QhR\         R\        R\        \         ,          /# )r7   r8   nr9   )r:   intr   )r;   s   "r<   r=   r=      s%     ; ;S ;S ;S ;r>   c                    \        V 4      p \        V 4      V8  d   V 0# \        \        V 4      V,
          ^,           4       Uu0 uF  q W"V,            kK  	  up# u upi )z Get character n-grams from text.)rF   lenrange)r8   rH   r   s   && r<   
get_ngramsrM      sP    $D
4y1}v!&s4y1}q'8!9:!9AQ3K!9:::s   Ac                h    V ^8  d   QhR\         \        ,          R\         \        ,          R\        /# )r7   set1set2r9   )r   r:   float)r;   s   "r<   r=   r=   '   s)     6 6SX 6SX 6% 6r>   c                    V '       d	   V'       g   R# \        W,          4      p\        W,          4      pV^ 8  d	   W#,          # R# )z,Compute Jaccard similarity between two sets.        )rK   )rO   rP   intersectionunions   &&  r<   jaccard_similarityrV   '   s6    tt{#LE#(19<5#5r>   c                0    V ^8  d   QhR\         R\        /# r7   itemr9   AnyItemr:   )r;   s   "r<   r=   r=   4   s       C r>   c                   \        V \        P                  4      '       d   V P                  # \        V \        P                  4      '       d   V P                  # \        V \        P
                  4      '       d   V P                   RV P                   2# \        V \        P                  4      '       d   V P                   RV P                   2# \        V \        P                  4      '       d   V P                   RV P                   2# \        V \        P                  4      '       d   V P                   RV P                   2# \        V \        P                  4      '       d   V P                  # V P                  # )z!Get comparable text from an item.rA   )
isinstancer   
RedditItemtitleHackerNewsItemYouTubeItemchannel_name
TikTokItemr8   author_nameInstagramItemPolymarketItemquestionWebSearchItem)rY   s   &r<   get_item_textri   4   s   $))**zz	D&//	0	0zz	D&,,	-	-**Qt00122	D&++	,	,))Ad../00	D&..	/	/))Ad../00	D&//	0	0**Qt}}o..	D&..	/	/zzyyr>   c                0    V ^8  d   QhR\         R\        /# rX   rZ   )r;   s   "r<   r=   r=   H   s       S r>   c                   \        V \        P                  4      '       d   V P                  R,          # \        V \        P                  4      '       d   V P                  R,          # \        V \        P
                  4      '       d   V P                  R,          # \        V \        P                  4      '       dm   V P                  pVP                  R4      '       d   VR,          P                  4       pV# VP                  R4      '       d   VR,          P                  4       pV# \        V \        P                  4      '       d   V P                  # \        V 4      # )zGet text for cross-source comparison.

Same as get_item_text() but truncates X posts to 100 chars
to level the playing field against short Reddit/HN titles.
Strips 'Show HN:' prefix from HN titles for fairer matching.
:Nd   NzShow HN::   NNzAsk HN::   NN)r]   r   XItemr8   rc   re   r`   r_   
startswithrE   rf   ri   )rY   r_   s   & r<   _get_cross_source_textrq   H   s     $%%yy$))**yy$,,--yy$--..

J''"IOO%E  i(("IOO%E$--..zzr>   c                F    V ^8  d   QhR\         R\        \         ,          /# r6   )r:   r   )r;   s   "r<   r=   r=   a   s"     C CS CSX Cr>   c                    \         P                  ! RRV P                  4       4      P                  4       pV Uu0 uF#  q"\        9  g   K  \        V4      ^8  g   K!  VkK%  	  up# u upi )z8Tokenize text for cross-source token Jaccard comparison.r@   rA   )rC   rD   rB   split	STOPWORDSrK   )r8   wordsws   &  r<   _tokenize_for_xrefrx   a   sN    FF:sDJJL1779EBu! 2As1vzAAuBBBs   A%
A%A%c                <    V ^8  d   QhR\         R\         R\        /# r7   text_atext_br9   r:   rQ   )r;   s   "r<   r=   r=   g   s!     2 23 2 2 2r>   c                    \        V 4      p\        V4      pV'       d	   V'       g   R# \        W#,          4      p\        W#,          4      pV'       d	   WE,          # R# )z.Token-level Jaccard similarity (word overlap).rS   )rx   rK   )r{   r|   tokens_atokens_brT   rU   s   &&    r<   _token_jaccardr   g   sH    !&)H!&)H8x*+L#$E#(<1c1r>   c                <    V ^8  d   QhR\         R\         R\        /# rz   r}   )r;   s   "r<   r=   r=   r   s!     ' 's 'C 'E 'r>   c                j    \        \        V 4      \        V4      4      p\        W4      p\        W#4      # )zAHybrid similarity: max of char-trigram Jaccard and token Jaccard.)rV   rM   r   max)r{   r|   trigram_sim	token_sims   &&  r<   _hybrid_similarityr   r   s-    $Z%7F9KLKv.I{&&r>   c          	          V ^8  d   QhR\         \        \        P                  \        P                  3,          ,          R\
        R\         \        \        \        3,          ,          /# r7   items	thresholdr9   )r   r   r   r^   ro   rQ   r   rI   )r;   s   "r<   r=   r=   y   sL      f''567 
%S/r>   c                :   . pV  Uu. uF  p\        \        V4      4      NK  	  pp\        \        V 4      4       FX  p\        V^,           \        V 4      4       F5  p\	        WE,          WF,          4      pWq8  g   K#  VP                  WV34       K7  	  KZ  	  V# u upi )zFind near-duplicate pairs in items.

Args:
    items: List of items to check
    threshold: Similarity threshold (0-1)

Returns:
    List of (i, j) index pairs where i < j and items are similar
)rM   ri   rL   rK   rV   append)r   r   
duplicatesrY   ngramsr   j
similaritys   &&      r<   find_duplicatesr   y   s     J ;@@%$jt,-%F@3u:q1uc%j)A+FIvyAJ&!!1&) *   As   Bc          	          V ^8  d   QhR\         \        \        P                  \        P                  3,          ,          R\
        R\         \        \        P                  \        P                  3,          ,          /# r   )r   r   r   r^   ro   rQ   )r;   s   "r<   r=   r=      s]     L Lf''567LL 
%!!6<</
01Lr>   c                V   \        V 4      ^8:  d   V # \        W4      p\        4       pV FP  w  rEW,          P                  W,          P                  8  d   VP	                  V4       K?  VP	                  V4       KR  	  \        V 4       UUu. uF  w  rgWc9  g   K  VNK  	  upp# u uppi )zRemove near-duplicates, keeping highest-scored item.

Args:
    items: List of items (should be pre-sorted by score descending)
    threshold: Similarity threshold

Returns:
    Deduplicated items
)rK   r   setscoreadd	enumerate)r   r   	dup_pairs	to_remover   r   idxrY   s   &&      r<   dedupe_itemsr      s     5zQ  1I I8>>UX^^+MM!MM!  #,E"2K"2YSc6JDD"2KKKs   
B%B%c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   r^   rQ   )r;   s   "r<   r=   r=      <     * *!!"** 
&

*r>   c                    \        W4      # )zDedupe Reddit items.r   r   r   s   &&r<   dedupe_redditr          
 ))r>   c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   ro   rQ   )r;   s   "r<   r=   r=      s8     * *** 
&,,*r>   c                    \        W4      # )zDedupe X items.r   r   s   &&r<   dedupe_xr      r   r>   c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   ra   rQ   )r;   s   "r<   r=   r=      s<     * *""#** 
&

*r>   c                    \        W4      # )zDedupe YouTube items.r   r   s   &&r<   dedupe_youtuber      r   r>   c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   rc   rQ   )r;   s   "r<   r=   r=      r   r>   c                    \        W4      # )zDedupe TikTok items.r   r   s   &&r<   dedupe_tiktokr      r   r>   c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   re   rQ   )r;   s   "r<   r=   r=      s<     * *$$%** 
&

*r>   c                    \        W4      # )zDedupe Instagram items.r   r   s   &&r<   dedupe_instagramr      r   r>   c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   r`   rQ   )r;   s   "r<   r=   r=      <     * *%%&** 
&

 *r>   c                    \        W4      # )zDedupe Hacker News items.r   r   s   &&r<   dedupe_hackernewsr      r   r>   c                    V ^8  d   QhR\         \        P                  ,          R\        R\         \        P                  ,          /# r   )r   r   rf   rQ   )r;   s   "r<   r=   r=      r   r>   c                    \        W4      # )zDedupe Polymarket items.r   r   s   &&r<   dedupe_polymarketr      r   r>   r   g?c                J    V ^8  d   QhR\         \        ,          R\        RR/# )r7   source_listsr   r9   N)r   r[   rQ   )r;   s   "r<   r=   r=      s/     %D %D=%D%D 
%Dr>   c                   . pV F  pVP                  V4       K  	  \        V4      ^8:  d   R# V Uu. uF  p\        V4      NK  	  pp\        \        V4      4       EF  p\        V^,           \        V4      4       F  p\	        W&,          4      \	        W',          4      J d   K)  \        WV,          WW,          4      pW8  g   KI  W',          P                  W&,          P                  9  d2   W&,          P                  P                  W',          P                  4       W&,          P                  W',          P                  9  g   K  W',          P                  P                  W&,          P                  4       K  	  EK"  	  R# u upi )a  Annotate items with cross-source references.

Compares items across different source types using hybrid similarity
(max of char-trigram Jaccard and token Jaccard). When similarity exceeds
threshold, adds bidirectional cross_refs with the related item's ID.
Modifies items in-place.

Args:
    *source_lists: Variable number of per-source item lists
    threshold: Similarity threshold for cross-linking (default 0.40)
N)	extendrK   rq   rL   typer   id
cross_refsr   )	r   r   	all_itemssource_listrY   textsr   r   r   s	   $*       r<   cross_source_linkr      s    I#% $ 9~ 7@@id#D)iE@3y>"q1uc)n-AIL!T),%77+EHeh?J&<??),*A*AAL++229<??C<??),*A*AAL++229<??C . # As   F)   )gffffff?)&__doc__rC   typingr   r   r   r    r   	frozensetru   rF   rM   rV   r^   ro   ra   rc   re   r`   rf   rh   r[   ri   rq   rx   r   r   r   r   r   r   r   r   r   r   r   r    r>   r<   <module>r      s   4 	 * *   	!#(*.046:<@	!%'-/57;=A     # %+ -2 48 :? 
	 	 	 	 #	 %)	 +/	 16	 8?	
 

 
 
  
 "(
 */
 17
 9?
 AE 	
;6 !!6<<1C1CVEVEV$$f&;&;V=R=RTZThThi j(2C2'6LB*******%D%D %Dr>   