
    PL
jTC                       U d Z ddlmZ ddlZddlZddlZddlZddlZddlZddl	m
Z
 ddlmZmZmZmZ i Zded<   dD ]Z eeed          Zeee ee          <    d)dZd*dZd+dZd,dZd-d.dZddd/d Zd0d"Zd0d#Zd1d&Zd2d(ZdS )3u  Shutdown forensics — capture context when the gateway receives SIGTERM/SIGINT.

The gateway's ``shutdown_signal_handler`` runs synchronously inside the
asyncio event loop.  We can't safely block it for long, but we DO want a
durable record of who/what triggered the shutdown so that "the gateway
keeps dying" incidents can be diagnosed after the fact.

This module exposes :func:`snapshot_shutdown_context`, a fast (<10ms),
non-blocking probe that returns a structured dict the signal handler can
log immediately, plus :func:`spawn_async_diagnostic`, a fire-and-forget
``ps`` walk that runs as a detached subprocess so it can't block teardown
even if /proc is wedged.

Anything that needs to wait (e.g. shelling out to ``ps aux``) belongs in
the async helper, never in the synchronous probe.
    )annotationsN)Path)AnyDictListOptionalzDict[int, str]_SIGNAL_NAME_BY_NUM)SIGTERMSIGINTSIGHUPSIGQUITSIGUSR1SIGUSR2sigr   returnstrc                    | dS 	 t          |           }n&# t          t          f$ r t          |           cY S w xY wt                              |d|           S )zBReturn a human-readable signal name (or ``str(sig)`` as fallback).NUNKNOWNzsignal#)int	TypeError
ValueErrorr   r	   get)r   sig_ints     >/home/kuhnn/.hermes/hermes-agent/gateway/shutdown_forensics.py_signal_namer   %   sl    
{yc((z"   3xx""7,?g,?,?@@@s     99pidr   keyOptional[str]c                R   	 t          d|  dd          5 }|D ]V}|                    |dz             r<|                    dd          d                                         c cddd           S W	 ddd           n# 1 swxY w Y   n# t          t
          t          f$ r Y nw xY wdS )zIRead a single field from /proc/<pid>/status.  Linux only; None elsewhere./proc/z/statusutf-8encoding:   N)open
startswithsplitstripFileNotFoundErrorPermissionErrorOSError)r   r   fhlines       r   _read_proc_fieldr/   0   s(   '3''''::: 	9b 9 9??39-- 9::c1--a0668888	9 	9 	9 	9 	9 	9 	9 	999	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 	9 8   4sA   B
 AA>#B
 0A>2B
 >BB
 BB
 
B$#B$c                D   	 t          d|  dd          5 }|                                }ddd           n# 1 swxY w Y   n# t          t          t          f$ r Y dS w xY w|sdS |                    dd                              dd	                                          S )
zLRead /proc/<pid>/cmdline as a printable string.  Linux only; None elsewhere.r    z/cmdlinerbN        r!   replace)errors)r&   readr*   r+   r,   r4   decoder)   )r   r-   datas      r   _read_proc_cmdliner9   <   s    (3((($// 	27799D	 	 	 	 	 	 	 	 	 	 	 	 	 	 	8   tt t<<&&--gi-HHNNPPPs,   A 7A ;A ;A AADict[str, Any]c                   d| i}| dk    r|S t          | d          }|||d<   t          | d          }|||d<   t          | d          }|$	 t          |          |d	<   n# t          $ r Y nw xY wt          | d
          }|!|r|                                d         n||d<   t	          |           }|r|dd         |d<   |S )zCompact /proc/<pid> snapshot: pid, ppid, state, uid, cmdline.

    Best-effort.  Missing fields are simply omitted rather than raising.
    r   r   NameNnameStatestatePPidppidUiduid,  cmdline)r/   r   r   r(   r9   )r   summaryr=   r?   rA   rC   rE   s          r   _proc_summaryrG   I   s   
  %clG
axxC((DS'**E C((D	!$iiGFOO 	 	 	D	
3
&
&C
+.7QC %%G +$TcT]	Ns   A! !
A.-A.received_signalc           
     f   t          j                     }t          j                    }t          j                    }t          j                    }||t          |           | t          |           nd||t          |          t          |          d}t          j        	                    d          }|r||d<   t          j        	                    d          }|r||d<   t          |          p|dk    |d<   	 t          j                    d	         |d
<   n# t          t          f$ r Y nw xY w	 t          |d          }|c|dk    r]|                                rt          |          n||d<   |                                rt          t          |                    nd|d<   n# t           t"          f$ r Y nw xY w	 t          j        	                    d          }	|	rt%          |	          dz  }
|
                                rF	 |
                    d          }|dd         |d<   d| |v pd| |v |d<   n# t          $ r Y nw xY wt%          |	          dz  }|                                r5	 |                    d          }|dd         |d<   n# t          $ r Y nw xY wn# t*          $ r Y nw xY w|S )a'  Fast (<10ms) snapshot of who/what is asking us to shut down.

    Captures:

    * The signal number/name (so SIGINT vs SIGTERM is visible)
    * Our own PID/ppid + parent process info from /proc (Linux)
    * Whether systemd is our parent (``ppid==1`` or ``INVOCATION_ID`` set)
    * Whether takeover/planned-stop markers exist (consumed lazily by the caller)
    * /proc/self limits + load average (1-min)
    * Wall-clock and monotonic timestamps for cross-correlating later phases

    Pure stdlib, never raises, never blocks on subprocesses.
    N)tsts_monotonicsignal
signal_numr   rA   parentselfINVOCATION_IDsystemd_invocation_idJOURNAL_STREAMsystemd_journal_streamr%   under_systemdr   
loadavg_1m	TracerPid0
tracer_pidtracerHERMES_HOMEz.gateway-takeover.jsonr!   r"   rD   takeover_markerz"target_pid": z'target_pid': takeover_marker_for_selfz.gateway-planned-stop.jsonplanned_stop_marker)time	monotonicosgetpidgetppidr   r   rG   environr   bool
getloadavgr,   AttributeErrorr/   isdigitr   r   r   exists	read_text	Exception)rH   nowr_   r   rA   ctxinvocation_idjournal_streamrY   hermes_home_strtakeover_pathrawplanned_stop_paths                r   snapshot_shutdown_contextrs   h   s0    )++C  I
)++C:<<D !//.=.Ic/***t%%c""	 	C JNN?33M 5'4#$Z^^$455N 7(6$%..;$!)CMOOA.L^$   !#{33&C--/5~~/?/? KFVC:@..:J:JTM#f++666PTCMz"   *..77 	 114LLM##%% 	'1171CCC-0#YC)*...#5 91C11S8 233    D $_ 5 58T T '')) +55w5GGC14TcTC-..   D    Js   5D D&%D&*A5F   F43F48AJ!  4H5 4J! 5
I?J! I)J! ,#J J! 
JJ! JJ! !
J.-J.g      @)timeout_secondslog_pathr   signal_namert   floatOptional[int]c               R   	 | j                             dd           n# t          $ r Y dS w xY wt          j        dk    rdS d| dt          j                     d}	 t          j        t          |           t
          j	        t
          j
        z  t
          j        z  d          }n# t          $ r Y dS w xY w	 t          j        d	|d
dd|g|t          j        t          j        dd          }nd# t           t          f$ rP 	 t          j        |           n# t          $ r Y nw xY wY 	 t          j        |           dS # t          $ r Y dS w xY ww xY w	 	 t          j        |           n:# t          $ r Y n.w xY w# 	 t          j        |           w # t          $ r Y w w xY wxY w|j        S )a  Fire-and-forget ``ps``-style snapshot written to ``log_path``.

    Runs as a detached subprocess so it can't block the asyncio event loop
    or compete with platform teardown.  The subprocess uses its own
    ``timeout`` so a wedged ``ps`` still self-cleans within
    ``timeout_seconds``.

    Returns the subprocess PID on success, ``None`` on failure.  Never
    raises.

    We deliberately avoid ``subprocess.run(["ps", "aux"])`` from inside the
    signal handler (the pre-existing pattern): on a busy host with hundreds
    of processes, ``ps aux`` can take >2s to walk /proc, during which the
    asyncio loop is frozen and adapter teardown can't begin.
    T)parentsexist_okNwin32z echo '=== shutdown diagnostic @ z ==='; echo '--- date ---'; date -u +%Y-%m-%dT%H:%M:%SZ; echo '--- ps auxf (top 60 by cpu) ---'; ps auxf --sort=-pcpu 2>/dev/null | head -60; echo '--- pstree of self ---'; pstree -plau a   2>/dev/null | head -40 || true; echo '--- /proc/loadavg ---'; cat /proc/loadavg 2>/dev/null || true; echo '--- recent dmesg (oom/killed) ---'; dmesg -T 2>/dev/null | tail -20 || journalctl --user -n 20 --no-pager 2>/dev/null | tail -20 || true; echo '=== end ==='i  timeoutz.0fbashz-c)stdoutstderrstdinstart_new_session	close_fds)rN   mkdirr,   sysplatformr`   ra   r&   r   O_WRONLYO_CREATO_APPEND
subprocessPopenSTDOUTDEVNULLr*   closer   )ru   rv   rt   scriptfdprocs         r   spawn_async_diagnosticr      s3   *dT::::   tt |wt
	; 
	 
	
 	
	 
	 
	  WS]]BK"*$<r{$JERR   tt ?00&$G$$"
 
 
 w'   	HRLLLL 	 	 	D		HRLLLLL 	 	 	DD	 	 	HRLLLL 	 	 	D		HRLLLL 	 	 	D	 8Os    
--AB% %
B32B375C- ,E8 -E?DE
D!E D!!E$E8 &D< <
E
	E
EE8 E( (
E54E58F:FF
FFFFrl   c                2   |                      dd          }|                      d          pi }|                     dd          }|                     d          pd}|                     d          pd}|                      d          rd	nd
}|                      d          }t          |t          t          f          r|dnd}g }	|                      d          1|                      d          }
|	                    d|
rdnd            |                      d          |	                    d           |                      d          r|	                    d| d                     |	rdd                    |	          z   nd}d| d| d| d| d| | d|S )z?Render a shutdown context dict as a single, scannable log line.rL   ?rN   rE   z	(unknown)r=   r   rT   yesnorU   z.2fr[   Nr\   ztakeover_marker_present=rO   otherr]   zplanned_stop_marker_present=yesrX   ztracer_pid=  zsignal=z under_systemd=z parent_pid=z parent_name=z loadavg_1m=z parent_cmdline=)r   
isinstancer   rw   appendjoin)rl   r   rN   
parent_cmdparent_name
parent_pidrT   loadload_strextrasfor_self
extras_strs               r   format_context_for_logr     s   
''(C
 
 CWWX$"FI{33J**V$$+KE"")cJ WW_55?EE4M77<  D *4#u > >G$}}}CHF
ww !!-77566H'FvvwHH	
 	
 	
 ww$%%17888
ww| 97C$577888-3;#((((J	)# 	) 	)&	) 	) 	) 	) #	) 	) 		)
 	) 	) %	) 	)    c                l    	 t          j        | t          d          S # t          t          f$ r Y dS w xY w)zFJSON-serialise a context dict for structured ingestion.  Never raises.T)default	sort_keysz{})jsondumpsr   r   r   )rl   s    r   context_as_jsonr   :  sE    z#sd;;;;z"   tts    33drain_timeoutOptional[Dict[str, Any]]c                   t           j                            d          }|sdS d}	 t          dd          5 }|D ]\}d|v rV|                                                    d          }t          |          D ]}|                    d          r|} n|r n]ddd           n# 1 swxY w Y   n# t          t          f$ r Y nw xY w|sdS d}dgg fD ]}	 t          j        d	g|d
|dddd          }	n"# t          t          j        t          f$ r Y Cw xY w|	j        dk    rS|	j                                        D ]|}|                    d          re|                    dd          d                                         }
|
                                rt%          |
          }nt'          |
          }| n}| n|dS |dz  }d}| |z   }||| |||k     dS )u.  At startup, sanity-check that systemd's TimeoutStopSec >= drain_timeout.

    When the gateway is run under a stale systemd unit file (e.g. the user
    upgraded hermes-agent but never re-ran ``hermes setup`` to regenerate
    the unit), ``TimeoutStopSec`` can be smaller than the configured
    ``restart_drain_timeout``.  Result: SIGTERM arrives, the drain starts,
    and systemd SIGKILLs the cgroup mid-drain — looks like a phantom kill
    in the journal because the journal only logs ``code=killed status=9``.

    Returns ``None`` when the alignment is fine OR we can't determine it
    (not running under systemd, ``systemctl`` unavailable, etc.).  Returns
    a dict with ``timeout_stop_sec`` + ``drain_timeout`` + ``mismatch``
    bool when we have data to report.

    Best-effort.  Never raises.
    rP   Nz/proc/self/cgroupr!   r"   z.service/z--user	systemctlshowz--property=TimeoutStopUSecTg       @)capture_outputtextr}   r   zTimeoutStopUSec==r%   g    .Ag      >@)unittimeout_stop_secr   expected_minmismatch)r`   rc   r   r&   r)   r(   reversedendswithr,   r*   r   runTimeoutExpired
returncoder   
splitlinesr'   rg   r   _parse_systemd_duration_to_us)r   rm   	unit_namer-   r.   partsp
timeout_usflagresultvaluer   headroomexpecteds                 r   check_systemd_timing_alignmentr   B  s   " JNN?33M t  $I%888 
	B 	 	%% JJLL..s33E%e__ " "::j11 "()I!E" ! 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 
	 &'    t
 !%JR   	^UtUVUYU8TU#$  FF ":#<gF 	 	 	H	!!M,,.. 		 		D122 

3**1-3355==?? F!$UJJ!>u!E!EJ)E!E " t!K/ Hx'H,& $x/  sG   B0 A B$B0 $B((B0 +B(,B0 0CC!C77DDrq   c                   | sdS dddddddd}d}d	}d	}| d
z   D ]O}|                                 s|dk    rp|rh|                    |                                          }||s dS 	 |t          t	          |          |z            z  }n# t
          $ r Y  dS w xY wd	}d	}||z  }|                                r||z  }|rj|rh|                    |                                          }| dS 	 |t          t	          |          |z            z  }n# t
          $ r Y  dS w xY wd	}d	}|r:|s8	 |t          t	          |          dz            z  }n# t
          $ r Y  dS w xY wd	}Q|dk    r|ndS )zParse 'TimeoutStopUSec=1min 30s' / '90s' style values to microseconds.

    systemd accepts a wide grammar; we cover the common cases (s, ms, min,
    h) and return None on anything unexpected.  Never raises.
    Nr%   i  i@B i l    $'- )usmsssecminhhrr   r   r   .)rg   r   lowerr   rw   r   isalpha)rq   unitstotal_ustokendigitsch
multipliers          r   r   r     s     t E HEFCi    ::<< 	299 
"YYu{{}}55
%V%44 E&MMJ$> ? ??HH!      444 bLFFZZ\\ 	RKEE 	 	5;;==11J!ttCf
 :;;;   tttFEE 	E 	Cf	 9:::   tttF!||88-s6   ("B
BB2"D
D$#D$2"E
E$#E$)r   r   r   r   )r   r   r   r   r   r   )r   r   r   r   )r   r   r   r:   )N)rH   r   r   r:   )ru   r   rv   r   rt   rw   r   rx   )rl   r:   r   r   )r   rw   r   r   )rq   r   r   rx   ) __doc__
__future__r   r   r`   rL   r   r   r^   pathlibr   typingr   r   r   r   r	   __annotations___namegetattr_valr   r   r/   r9   rG   rs   r   r   r   r   r    r   r   <module>r      s    " # " " " " "  				      



        , , , , , , , , , , , , ')  ( ( ( (M / /E765$''D).CCII&A A A A	 	 	 	
Q 
Q 
Q 
Q   >Z Z Z Z ZB !	Q Q Q Q Q Qh   B   T T T Tn5. 5. 5. 5. 5. 5.r   