
    PL
jD                        d Z ddlmZ ddlZddlZddlZddlmZ  ej        e	          Z
 ej        d          Zdd	ZddZd dZd!dZd"d#dZddZd dZd$dZd dZddZg dZdS )%at  Message and tool-payload sanitization helpers.

Pure functions extracted from ``run_agent.py`` so the AIAgent module can
stay focused on the conversation loop.  These walk OpenAI-format message
lists and structured payloads, repairing or stripping problematic
characters that would otherwise crash ``json.dumps`` inside the OpenAI
SDK or be rejected by upstream APIs.

All helpers are stateless and side-effect-free except for in-place
mutation of their input (where documented).  Backward-compatible
re-exports from ``run_agent`` remain in place so existing imports
``from run_agent import _sanitize_surrogates`` keep working.
    )annotationsN)Anyz[\ud800-\udfff]textstrreturnc                p    t                               |           rt                               d|           S | S )zReplace lone surrogate code points with U+FFFD (replacement character).

    Surrogates are invalid in UTF-8 and will crash ``json.dumps()`` inside the
    OpenAI SDK.  This is a fast no-op when the text contains no surrogates.
       �)_SURROGATE_REsearchsubr   s    >/home/kuhnn/.hermes/hermes-agent/agent/message_sanitization.py_sanitize_surrogatesr      s5     D!! 1  4000K    payloadr   boolc                0    dfd |            S )uu  Replace surrogate code points in nested dict/list payloads in-place.

    Mirror of ``_sanitize_structure_non_ascii`` but for surrogate recovery.
    Used to scrub nested structured fields (e.g. ``reasoning_details`` — an
    array of dicts with ``summary``/``text`` strings) that flat per-field
    checks don't reach.  Returns True if any surrogates were replaced.
    Fc                   t          | t                    r|                                 D ]|\  }}t          |t                    r;t                              |          r t                              d|          | |<   dUt          |t          t          f          r |           }d S t          | t                    rt          |           D ]~\  }}t          |t                    r;t                              |          r t                              d|          | |<   dUt          |t          t          f          r |           }d S d S )Nr	   T)	
isinstancedictitemsr   r
   r   r   list	enumerate)nodekeyvalueidx_walkfounds       r   r   z-_sanitize_structure_surrogates.<locals>._walk4   sQ   dD!! 	!"jjll ! !
UeS)) !$++E22 %$1$5$5h$F$FS	 $d|44 !E%LLL! ! d## 	!'oo ! !
UeS)) !$++E22 %$1$5$5h$F$FS	 $d|44 !E%LLL	! 	!! !r    r   r   r   s    @@r   _sanitize_structure_surrogatesr"   *   s=     E! ! ! ! ! !& 
E'NNNLr   messagesr   c                p   d}| D ]}t          |t                    s|                    d          }t          |t                    r;t                              |          r!t                              d|          |d<   d}nt          |t                    r~|D ]{}t          |t                    rd|                    d          }t          |t                    r:t                              |          r t                              d|          |d<   d}||                    d          }t          |t                    r:t                              |          r t                              d|          |d<   d}|                    d          }t          |t                    rs|D ]o}t          |t                    s|                    d          }	t          |	t                    r:t                              |	          r t                              d|	          |d<   d}|                    d	          }
t          |
t                    r|
                    d          }t          |t                    r:t                              |          r t                              d|          |
d<   d}|
                    d
          }t          |t                    r:t                              |          r t                              d|          |
d
<   d}q|                                D ]\  }}|dv r
t          |t                    r;t                              |          r t                              d|          ||<   d}Zt          |t          t          f          rt          |          rd}|S )a  Sanitize surrogate characters from all string content in a messages list.

    Walks message dicts in-place. Returns True if any surrogates were found
    and replaced, False otherwise. Covers content/text, name, tool call
    metadata/arguments, AND any additional string or nested structured fields
    (``reasoning``, ``reasoning_content``, ``reasoning_details``, etc.) so
    retries don't fail on a non-content field.  Byte-level reasoning models
    (xiaomi/mimo, kimi, glm) can emit lone surrogates in reasoning output
    that flow through to ``api_messages["reasoning_content"]`` on the next
    turn and crash json.dumps inside the OpenAI SDK.
    Fcontentr	   Tr   name
tool_callsidfunction	arguments>   r&   roler%   r'   )
r   r   getr   r
   r   r   r   r   r"   )r#   r   msgr%   partr   r&   r'   tctc_idfnfn_namefn_argsr   r   s                  r   _sanitize_messages_surrogatesr4   K   sw    E 3! 3!#t$$ 	'')$$gs## 		%(<(<W(E(E 		%*..xAAC	NEE&& 	% % %dD)) %88F++D!$,, %1E1Ed1K1K %'4'8'84'H'HV $wwvdC   	]%9%9$%?%? 	'++Hd;;CKEWW\**
j$'' 	%  % %!"d++ teS)) !m.B.B5.I.I !,005AABtH EVVJ''b$'' % ffVnnG!'3// %M4H4H4Q4Q %%2%6%6x%I%I6
 $ ff[11G!'3// %M4H4H4Q4Q %*7*;*;Hg*N*N; $ ))++ 		! 		!JC???%%% ! ''.. !,005AACH EED$<00 !1%88 ! E		! Lr   rawc                2   g }d}d}t          |           }||k     r| |         }|r|dk    rB|dz   |k     r9|                    |           |                    | |dz                       |dz  }X|dk    rd}|                    |           nmt          |          dk     r'|                    dt          |          d	           n3|                    |           n|dk    rd
}|                    |           |dz  }||k     d                    |          S )ug  Escape unescaped control chars inside JSON string values.

    Walks the raw JSON character-by-character, tracking whether we are
    inside a double-quoted string. Inside strings, replaces literal
    control characters (0x00-0x1F) that aren't already part of an escape
    sequence with their ``\uXXXX`` equivalents. Pass-through for everything
    else.

    Ported from #12093 — complements the other repair passes in
    ``_repair_tool_call_arguments`` when ``json.loads(strict=False)`` is
    not enough (e.g. llama.cpp backends that emit literal apostrophes or
    tabs alongside other malformations).
    Fr   \      "    z\u04xT )lenappendordjoin)r5   out	in_stringinchs         r   %_escape_invalid_chars_in_json_stringsrG      s+    CI	ACA
a%%V 	Tzza!eaii

2

3q1u:&&&QSyy!	

2R4

.R...////

2Syy 	JJrNNN	Q) a%%* 773<<r   ?raw_args	tool_namec                z   t          | t                    r|                                 nd}|st                              d|           dS |dk    rt                              d|           dS 	 t          j        |d          }t          j        |d	          }||k    rt                              d
|           |S # t
          j        t          t          f$ r Y nw xY w|}t          j        dd|          }|                    d          |                    d          z
  }|                    d          |                    d          z
  }|dk    r|d|z  z  }|dk    r|d|z  z  }t          d          D ]}	 t          j        |            n# t
          j        $ r |                    d          r7|                    d          |                    d          k    r|dd         }nO|                    d          r7|                    d          |                    d          k    r|dd         }nY  nY w xY w	 t          j        |           t                              d||dd         |dd                    |S # t
          j        $ r Y nw xY w	 t!          |          }	|	|k    rCt          j        |	           t                              d||dd         |	dd                    |	S n"# t
          j        t          t          f$ r Y nw xY wt                              d||dd                    dS )a  Attempt to repair malformed tool_call argument JSON.

    Models like GLM-5.1 via Ollama can produce truncated JSON, trailing
    commas, Python ``None``, etc.  The API proxy rejects these with HTTP 400
    "invalid tool call arguments".  This function applies common repairs;
    if all fail it returns ``"{}"`` so the request succeeds (better than
    crashing the session).  All repairs are logged at WARNING level.
    r=   z*Sanitized empty tool_call arguments for %sz{}Nonez0Sanitized Python-None tool_call arguments for %sF)strict),:)
separatorsz>Repaired unescaped control chars in tool_call arguments for %sz,\s*([}\]])z\1{}[]r   2   Nu8   Repaired malformed tool_call arguments for %s: %s → %sP   uA   Repaired control-char-laced tool_call arguments for %s: %s → %suP   Unrepairable tool_call arguments for %s — replaced with empty object (was: %s))r   r   striploggerwarningjsonloadsdumpsJSONDecodeError	TypeError
ValueErrorrer   countrangeendswithrG   )
rI   rJ   raw_strippedparsedreserialisedfixed
open_curlyopen_bracket_escapeds
             r   _repair_tool_call_argumentsrm      s    (2(C'@'@H8>>###bL  CYOOOt vI9UUUt
L777z&Z@@@<''NNP    )Z8    EF>5%00ES!!EKK$4$44J;;s##ekk#&6&66LA~~z!!a|##2YY 
 
		JuE# 	 	 	~~c"" u{{3'7'7%++c:J:J'J'Jcrc
$$ S)9)9EKK<L<L)L)Lcrc
	
5F|CRC(%*	
 	
 	
    
7>>eJwNNS<,gcrcl   N   )Z8   
 NN	/<$  
 4sL   /AB> >CC<FB'H?>H?AJ JJAK5 5LLc                V    |                      dd                              d          S )zRemove non-ASCII characters, replacing with closest ASCII equivalent or removing.

    Used as a last resort when the system encoding is ASCII and can't handle
    any non-ASCII characters (e.g. LANG=C on Chromebooks).
    asciiignore)errors)encodedecoder   s    r   _strip_non_asciirt     s(     ;;wx;0077@@@r   c                R   d}| D ] }t          |t                    s|                    d          }t          |t                    rt	          |          }||k    r||d<   d}nut          |t
                    r`|D ]]}t          |t                    rF|                    d          }t          |t                    rt	          |          }||k    r||d<   d}^|                    d          }t          |t                    rt	          |          }||k    r||d<   d}|                    d          }t          |t
                    r|D ]}	t          |	t                    rq|	                    di           }
t          |
t                    rF|
                    d          }t          |t                    rt	          |          }||k    r||
d<   d}|                                D ];\  }}|d	v r
t          |t                    rt	          |          }||k    r|||<   d}<"|S )
a  Strip non-ASCII characters from all string content in a messages list.

    This is a last-resort recovery for systems with ASCII-only encoding
    (LANG=C, Chromebooks, minimal containers).  Returns True if any
    non-ASCII content was found and sanitized.
    Fr%   Tr   r&   r'   r)   r*   >   r&   r+   r%   r'   )r   r   r,   r   rt   r   r   )r#   r   r-   r%   	sanitizedr.   r   r&   r'   r/   r1   r3   r   r   s                 r   _sanitize_messages_non_asciirw   #  s]    E /! /!#t$$ 	'')$$gs## 	)(11IG##!*I&& 	) ) )dD)) )88F++D!$,, )$4T$:$:	$,,+4DL$(EwwvdC   	(..ID  'FWW\**
j$'' 
	-  	- 	-b$'' -
B//B!"d++ -"$&&"5"5%gs33 -(8(A(AI(G332;;(,))++ 	! 	!JC???%%% !,U33	%%(CH E	! Lr   toolsc                     t          |           S )z7Strip non-ASCII characters from tool payloads in-place.)_sanitize_structure_non_ascii)rx   s    r   _sanitize_tools_non_asciir{   ^  s    (///r   c                2   d}g }t          |           D ]\  }}t          |t                    s|                    d          }t          |t                    sFg }|D ]F}t          |t                    r|                    d          dv rd}1|                    |           Gt          |          t          |          k     r<|r||d<   |                    d          dk    rd|d<   |                    |           t          |          D ]}| |= |S )	uB  Remove image_url content parts from all messages in-place.

    Called when a server signals it does not support images (e.g.
    "Only 'text' content type is supported.").  Mutates messages so the
    next API call sends text only.

    Preserves message alternation invariants:
      * ``tool``-role messages whose content was entirely images are replaced
        with a plaintext placeholder, NOT deleted — deleting them would leave
        the paired ``tool_call_id`` on the prior assistant message unmatched,
        which providers reject with HTTP 400.
      * Non-tool messages whose content becomes empty are dropped.  In
        practice this only hits synthetic image-only user messages appended
        for attachment delivery; real user turns always include text.

    Returns True if any image parts were removed.
    Fr%   type>   image	image_urlinput_imageTr+   toolu:   [image content removed — server does not support images])r   r   r   r,   r   r?   r>   reversed)r#   r   	to_deleterD   r-   r%   	new_partsr.   s           r   _strip_images_from_messagesr   c  sH   $ EIH%% $ $3#t$$ 	'')$$'4(( 		 	' 	'D$%% '$((6*:*:>c*c*c  &&&&y>>CLL(( 	$!*IF** "^I   ###i    QKKLr   c                0    dfd |            S )zCStrip non-ASCII characters from nested dict/list payloads in-place.Fc                &   t          | t                    ru|                                 D ]^\  }}t          |t                    rt	          |          }||k    r|| |<   d7t          |t          t
          f          r |           _d S t          | t
                    rnt          |           D ]`\  }}t          |t                    rt	          |          }||k    r|| |<   d7t          |t          t
          f          r |           _d S d S )NT)r   r   r   r   rt   r   r   )r   r   r   rv   r   r   r   s        r   r   z,_sanitize_structure_non_ascii.<locals>._walk  s5   dD!! 	!"jjll ! !
UeS)) ! 0 7 7I E))$-S	 $d|44 !E%LLL! ! d## 	!'oo ! !
UeS)) ! 0 7 7I E))$-S	 $d|44 !E%LLL	! 	!! !r   r    r!   s    @@r   rz   rz     s;    E! ! ! ! ! !* 
E'NNNLr   )r
   r   r"   r4   rG   rm   rt   rw   r{   r   rz   )r   r   r   r   )r   r   r   r   )r#   r   r   r   )r5   r   r   r   )rH   )rI   r   rJ   r   r   r   )rx   r   r   r   )__doc__
__future__r   r[   loggingra   typingr   	getLogger__name__rY   compiler
   r   r"   r4   rG   rm   rt   rw   r{   r   rz   __all__r    r   r   <module>r      sk    # " " " " "   				      		8	$	$ 
-..      BA A A AH' ' ' 'T^ ^ ^ ^ ^BA A A A8 8 8 8v0 0 0 0
- - - -`   :  r   