
    PL
jt                    \   U d Z ddlmZ ddlZddlZddlZddlmZmZm	Z	m
Z
mZ ddlmZ ddlmZ  ej        e          ZerddlmZ daded	<   d)dZ G d d          Z e            Zd*dZd+dZd,dZd,dZd,dZd+dZd-dZ d.dZ!d-dZ"d/dZ#d0d!Z$d1d#Z%d2d&Z& G d' d(e          Z'dS )3u  Firecrawl web search + extract — plugin form.

Subclasses :class:`agent.web_search_provider.WebSearchProvider`. This is
the largest provider migrated in this PR; it captures the full inline
firecrawl implementation that previously lived in tools/web_tools.py:

  - :data:`Firecrawl` lazy proxy that defers the ~200ms SDK import to
    first use (re-exported by tools.web_tools for backward compat with
    existing tests that mock that name).
  - :func:`_get_firecrawl_client` with direct + managed-gateway dual
    mode, controlled by ``web.use_gateway`` config when both are
    configured.
  - :func:`check_firecrawl_api_key` re-exported (tests + tools_config
    setup hint depend on this name living in tools.web_tools).
  - :func:`_extract_web_search_results` / :func:`_extract_scrape_payload`
    response-shape normalizers that handle SDK / direct API / gateway
    response variants.
  - Per-URL extract loop with 60s timeout, redirect-aware SSRF re-check,
    website-policy gating, and format-aware content selection.

Async note: the underlying SDK is sync. ``extract()`` is declared
``async def`` because it performs per-URL I/O that benefits from
running in an executor; the implementation wraps each scrape in
:func:`asyncio.to_thread` with :func:`asyncio.wait_for(timeout=60)` to
guard against hung fetches.

Config keys this provider responds to::

    web:
      search_backend: "firecrawl"     # explicit per-capability
      extract_backend: "firecrawl"    # explicit per-capability
      backend: "firecrawl"            # shared fallback (default)
      use_gateway: false              # prefer managed gateway when both
                                      # direct + gateway credentials exist

Env vars::

    FIRECRAWL_API_KEY=...            # direct cloud auth
    FIRECRAWL_API_URL=...            # self-hosted Firecrawl
    FIRECRAWL_GATEWAY_URL=...        # Nous tool-gateway (subscribers)
    TOOL_GATEWAY_DOMAIN=...          # alternate gateway env
    TOOL_GATEWAY_SCHEME=...
    TOOL_GATEWAY_USER_TOKEN=...
    )annotationsN)AnyDictListOptionalTYPE_CHECKING)WebSearchProvider)check_website_access	FirecrawlzOptional[type]_FIRECRAWL_CLS_CACHEreturntypec                     t           V	 ddlm}   | dd           n9# t          $ r Y n-t          $ r!}t          t          |                    d}~ww xY wddlm} |a t           S )z)Import and cache ``firecrawl.Firecrawl``.Nr   )ensurezsearch.firecrawlF)promptr   )r   tools.lazy_depsr   ImportError	Exceptionstr	firecrawlr   )_lazy_ensureexc_clss      B/home/kuhnn/.hermes/hermes-agent/plugins/web/firecrawl/provider.py_load_firecrawl_clsr   L   s     #	(>>>>>>L+E::::: 	 	 	D 	( 	( 	(c#hh'''	(//////#s    
A	AAAc                  .    e Zd ZdZdZddZdd
ZddZdS )_FirecrawlProxyzJCallable proxy that looks like ``firecrawl.Firecrawl`` but imports lazily. argsr   kwargsr   c                *     t                      |i |S N)r   )selfr    r!   s      r   __call__z_FirecrawlProxy.__call__c   s    $"$$d5f555    objboolc                :    t          |t                                S r#   )
isinstancer   )r$   r'   s     r   __instancecheck__z!_FirecrawlProxy.__instancecheck__f   s    #244555r&   r   c                    dS )Nz <lazy firecrawl.Firecrawl proxy>r   r$   s    r   __repr__z_FirecrawlProxy.__repr__i   s    11r&   N)r    r   r!   r   r   r   )r'   r   r   r(   r   r   )__name__
__module____qualname____doc__	__slots__r%   r+   r.   r   r&   r   r   r   ^   s[        TTI6 6 6 66 6 6 62 2 2 2 2 2r&   r   Optional[tuple]c                    t          j        dd                                          } t          j        dd                                                              d          }| s|sdS i }| r| |d<   |r||d<   |d|pd| pdffS )	zHReturn explicit direct Firecrawl kwargs + cache key, or None when unset.FIRECRAWL_API_KEY FIRECRAWL_API_URL/Napi_keyapi_urldirect)osgetenvstriprstrip)r;   r<   r!   s      r   _get_direct_firecrawl_configrB   z   s    i+R006688Gi+R006688??DDG 7 tF $#y $#yHgow$???r&   r   c                 8    ddl m}  |                     d          S )z,Return the configured Firecrawl gateway URL.r   Nr   )tools.web_tools	web_toolsbuild_vendor_gateway_url_wts    r   _get_firecrawl_gateway_urlrI      s(    !!!!!!''444r&   r(   c                 J    ddl m}  |                     d| j                  duS )a  Return True when gateway URL + Nous Subscriber token are available.

    Reads ``read_nous_access_token`` and ``resolve_managed_tool_gateway``
    via :mod:`tools.web_tools` rather than direct imports, so unit tests
    that ``patch("tools.web_tools._read_nous_access_token", ...)`` see
    their patches honored. The names are re-exported on
    :mod:`tools.web_tools` for exactly this reason.
    r   Nr   token_reader)rD   rE   resolve_managed_tool_gateway_read_nous_access_tokenrG   s    r   _is_tool_gateway_readyrO      sC     "!!!!!++#"= ,   r&   c                 "    t                      duS )zBReturn True when direct Firecrawl config is explicitly configured.N)rB   r   r&   r   _has_direct_firecrawl_configrQ      s    '))55r&   c                 :    t                      pt                      S )zReturn True when Firecrawl backend (direct or gateway) is usable.

    Re-exported by :mod:`tools.web_tools` for backward compatibility with
    existing tests and the ``hermes tools`` setup flow.
    )rQ   rO   r   r&   r   check_firecrawl_api_keyrS      s     ())E-C-E-EEr&   c                 @    ddl m}  |                                 sdS 	 dS )zAReturn optional managed-gateway guidance for Firecrawl help text.r   Nr8   zc, or use the Nous Tool Gateway via your subscription (FIRECRAWL_GATEWAY_URL or TOOL_GATEWAY_DOMAIN))rD   rE   managed_nous_tools_enabledrG   s    r   _firecrawl_backend_help_suffixrV      s<    !!!!!!))++ r	9 r&   Nonec                 b    ddl m}  d}|                                 r|dz  }t          |          )z>Raise a clear error for unsupported web backend configuration.r   NzWeb tools are not configured. Set FIRECRAWL_API_KEY for cloud Firecrawl or set FIRECRAWL_API_URL for a self-hosted Firecrawl instance.u    With your Nous subscription you can also use the Tool Gateway — run `hermes tools` and select Nous Subscription as the web provider.)rD   rE   rU   
ValueError)rH   messages     r   &_raise_web_backend_configuration_errorr[      sT    !!!!!!	0 
 %%'' 
S	
 W

r&   r   c                    ddl m}  t                      }||                     d          s|\  }}ne|                     d| j                  }|(t                              d           t                       |j	        |j
        d}d|d	         |j	        f}t          | d
d          }t          | dd          }|||k    r|S  | j        di || _        || _        | j        S )u6  Get or create the cached Firecrawl client.

    When ``web.use_gateway`` is set in config, the managed Tool Gateway is
    preferred even if direct Firecrawl credentials are present. Otherwise
    direct Firecrawl takes precedence when explicitly configured.

    Raises ValueError when neither path is usable.

    The cached client is stored on :mod:`tools.web_tools` (as
    ``_firecrawl_client`` and ``_firecrawl_client_config``) rather than on
    this plugin module so that unit tests that reset the cache via
    ``tools.web_tools._firecrawl_client = None`` keep working. Helper
    functions (``prefers_gateway``, ``resolve_managed_tool_gateway``,
    ``_read_nous_access_token``, ``Firecrawl``) are also looked up via
    :mod:`tools.web_tools` for the same reason — see
    :func:`_is_tool_gateway_ready`.
    r   Nwebr   rK   zTFirecrawl client initialization failed: missing direct config and tool-gateway auth.)r;   r<   ztool-gatewayr<   _firecrawl_client_firecrawl_client_configr   )rD   rE   rB   prefers_gatewayrM   rN   loggererrorr[   nous_user_tokengateway_origingetattrr   r^   r_   )rH   direct_configr!   client_configmanaged_gatewaycachedcached_configs          r   _get_firecrawl_clientrk      s5   $ "!!!!!022M )<)<U)C)C  -::c&A ; 
 
 "LL?   3444 '6&5
 

 9+
 S-t44FC!;TBBMm}<< *CM33F33C#0C   r&   c                 .    ddl m}  d| _        d| _        dS )zDrop the cached Firecrawl client so tests can re-instantiate cleanly.

    Clears the canonical slots on :mod:`tools.web_tools` (where
    :func:`_get_firecrawl_client` reads/writes them).
    r   N)rD   rE   r^   r_   rG   s    r   _reset_client_for_testsrm     s,     "!!!!! C#'C   r&   valuec           	     l   | dS t          | t          t          t          t          t
          t          f          r| S t          | d          r%	 |                                 S # t          $ r Y nw xY wt          | d          r4	 d | j
                                        D             S # t          $ r Y nw xY w| S )zBConvert SDK objects to plain python data structures when possible.N
model_dump__dict__c                D    i | ]\  }}|                     d           ||S )_)
startswith).0kvs      r   
<dictcomp>z$_to_plain_object.<locals>.<dictcomp>&  s0    UUUTQ1<<PSCTCTUAqUUUr&   )r*   dictlistr   intfloatr(   hasattrrp   r   rq   items)rn   s    r   _to_plain_objectr     s    }t%$c3t<== ul## 	##%%% 	 	 	D	 uj!! 	UUU^%9%9%;%;UUUU 	 	 	D	 Ls$   A   
A-,A-"B$ $
B10B1valuesList[Dict[str, Any]]c                    t          | t                    sg S g }| D ];}t          |          }t          |t                    r|                    |           <|S )z7Normalize mixed SDK/list payloads into a list of dicts.)r*   rz   r   ry   append)r   
normalizeditemplains       r   _normalize_result_listr   -  sg    fd## 	')J % % &&eT"" 	%e$$$r&   responsec                v   t          |           }t          |t                    r|                    d          }t          |t                    rt          |          S t          |t                    rLt          |                    d                    }|r|S t          |                    d                    }|r|S t          |                    d                    }|r|S t          |                    d                    }|r|S t          | d          rt          t          | dg                     S g S )zKExtract Firecrawl search results across SDK/direct/gateway response shapes.datar]   results)r   r*   ry   getrz   r   r}   re   )r   response_plainr   data_webdata_resultstop_webtop_resultss          r   _extract_web_search_resultsr   :  s4   %h//N.$'' !!&))dD!! 	0)$///dD!! 	$-dhhuoo>>H  1$((92E2EFFL $##(););E)B)BCC 	N,^-?-?	-J-JKK 	x D%ghr&B&BCCCIr&   scrape_resultDict[str, Any]c                    t          |           }t          |t                    si S |                    d          }t          |t                    r|S |S )zINormalize Firecrawl scrape payload shape across SDK and gateway variants.r   )r   r*   ry   r   )r   result_plainnesteds      r   _extract_scrape_payloadr   Y  sV    #M22LlD)) 	f%%F&$ r&   c                      e Zd ZdZedd            Zedd            ZddZddZdd	Z	dd
Z
dddZddZd dZd!dZdS )"FirecrawlWebSearchProviderz9Firecrawl search + extract provider with dual auth paths.r   r   c                    dS )Nr   r   r-   s    r   namezFirecrawlWebSearchProvider.namen      {r&   c                    dS )Nr   r   r-   s    r   display_namez'FirecrawlWebSearchProvider.display_namer  r   r&   r(   c                    t                      S )zHReturn True when direct Firecrawl OR managed-gateway path is configured.)rS   r-   s    r   is_availablez'FirecrawlWebSearchProvider.is_availablev  s    &(((r&   c                    dS NTr   r-   s    r   supports_searchz*FirecrawlWebSearchProvider.supports_searchz      tr&   c                    dS r   r   r-   s    r   supports_extractz+FirecrawlWebSearchProvider.supports_extract}  r   r&   c                    dS r   r   r-   s    r   supports_crawlz)FirecrawlWebSearchProvider.supports_crawl  r   r&      querylimitr{   r   c                   ddl m}  |            rdddS t                              d||           t	                      }	 |                    ||          }t          |          }t                              dt          |                     d	d
|idS # t          $ r-}t          	                    d|           dd| dcY d}~S d}~ww xY w)uz  Execute a Firecrawl search.

        Sync; matches the legacy ``_get_firecrawl_client().search(...)``
        call directly. Normalizes the response across SDK/direct/gateway
        shapes via :func:`_extract_web_search_results`.

        Pre-flight errors (``ValueError`` from configuration check,
        ``ImportError`` from missing SDK) propagate to the dispatcher's
        top-level handler, which wraps them as ``tool_error(...)`` —
        matching the legacy ``{"error": "Error searching web: ..."}``
        envelope. Only in-flight errors are caught and surfaced as
        ``{"success": False, "error": ...}``.
        r   is_interruptedFInterrupted)successrb   z!Firecrawl search: '%s' (limit=%d))r   r   z"Firecrawl: found %d search resultsTr]   )r   r   zFirecrawl search error: %szFirecrawl search failed: N)
tools.interruptr   ra   infork   searchr   lenr   warning)r$   r   r   r   clientr   web_resultsr   s           r   r   z!FirecrawlWebSearchProvider.search  s    	322222> 	>$}===7FFF '((	R}}5}>>H5h??KKK<c+>N>NOOO#e[-ABBB 	R 	R 	RNN7===$/P3/P/PQQQQQQQQ	Rs   AB 
C "CCCurls	List[str]r!   r   r   c                  K   ddl m}  |            rd |D             S |                    d          }g }|dk    rdg}n|dk    rdg}nddg}g }|D ]} |            r|                    |ddd	           't	          |          }|r`t
                              d
|d         |d                    |                    |dd|d         |d         |d         |d         dd           	 t
                              d|           	 t          j        t          j	        t                      j        ||          d           d{V }	nK# t          j        $ r9 t
                              d|           |                    |dddd           Y ?w xY wt          |	          }
|
                    di           }|
                    d          }|
                    d          }t          |t                     s?t#          |d          r|                                }nt#          |d          r|j        }ni }|                    dd          }|                    d|          }t	          |          }|rbt
                              d|d         |d                    |                    ||dd|d         |d         |d         |d         dd           |dk    s||r|}n|p|pd}|                    |||||d            # t(          $ rO}t
                              d!||           |                    |dddt-          |          d"           Y d}~	d}~ww xY w|S )#a  Extract content from one or more URLs via Firecrawl.

        Async; each URL is scraped in a background thread with a 60s
        timeout. After scraping, the final URL (post-redirect) is
        re-checked against website-access policy.

        Accepted kwargs (others ignored for forward compat):
          - ``format``: ``"markdown"`` or ``"html"``; default is both
            (request both, return markdown when available).

        Returns the legacy per-URL list-of-results shape. Per-URL failures
        (timeout, SSRF block, scrape error, policy block) become items
        with an ``error`` field rather than raising.
        r   r   c                    g | ]}|d dd	S )r   r8   urlrb   titler   )ru   us     r   
<listcomp>z6FirecrawlWebSearchProvider.extract.<locals>.<listcomp>  s"    RRRACCRRRr&   formatmarkdownhtmlr   r8   r   z%Blocked web_extract for %s by rule %shostrulerZ   sourcer   r   r   )r   r   contentrb   blocked_by_policyzFirecrawl scraping: %s)r   formats<   )timeoutNz!Firecrawl scrape timed out for %suc   Scrape timed out after 60s — page may be too large or unresponsive. Try browser_navigate instead.r   r   r   rb   metadatarp   rq   r   	sourceURLz0Blocked redirected web_extract for %s by rule %sr   r   r   raw_contentrb   r   r   r   r   r   r   z"Firecrawl scrape failed for %s: %s)r   r   r   r   rb   )r   r   r   r   r
   ra   r   asynciowait_for	to_threadrk   scrapeTimeoutErrorr   r   r*   ry   r}   rp   rq   r   debugr   )r$   r   r!   _is_interruptedr   r   r   r   blockedr   scrape_payloadr   content_markdowncontent_htmlr   	final_urlfinal_blockedchosen_content
scrape_errs                      r   extractz"FirecrawlWebSearchProvider.extract  s      	FEEEEE? 	SRRTRRRRH%%Z!lGGvhGG!6*G )+ x	 x	C   s]RPPQQQ +3//G ;FOFO  
 "!##%!(!3$+FO$+FO&-h&7. .
 
   \4c:::*1*:)133: #$+  
 !#+ + + % % % % % %MM +   NN#FLLLNN#&%'')!Q 
 
 
 H "9!G!G)--j"==#1#5#5j#A#A -11&99 "(D11 &x66 &#+#6#6#8#8 :66 &#+#4#% Wb11$LLc::	 !5Y ? ?  KKJ%f-%f-  
 NN#,%*')+-%29%=(5f(=(5f(=*7*A2 2     Z''FN?ON%5NN%1%K5E%KN(!&#1'5$,      
 
 
A3
SSS"!##%')!$Z        
 sF   +LAE
	L
AFLFEL.L
M AMM r   c                
	  K   	 ddl m}  |            r
d|ddddgiS |                    d          }|                    dd	          }|rt                              d
           t                              d||           |ddgid}t          j        t                      j        fd|i| d{V }g }t          |d          rI|j
        r|j
        ng }t                              dt          |dd          t          |                     n_t          |t                    rd|v r|                    dg           pg }n-t                              dt!          |          j                   g }	|D ]}
d}d}i }t          |
d          rU|
                                }|                    d          }|                    d          }|                    di           }nt          |
d          rt          |
dd          }t          |
dd          }t          |
di           }t          |d          r|                                }nt          |d          r|j        }npt          |t                    r|}nXi }nUt          |
t                    r@|
                    d          }|
                    d          }|
                    di           }t          |t                    s?t          |d          r|                                }nt          |d          r|j        }ni }|                    d|                    dd                    }|                    dd          }t)          |          }|rbt                              d|d         |d                    |	                    ||dd|d          |d         |d         |d!         d"d#           e|p|pd}|	                    |||||d$           d|	iS # t,          $ r!}d|ddt/          |          dgicY d}~S d}~wt0          $ r}d|ddd%| dgicY d}~S d}~wt2          $ r2}t                              d&|           d|ddd'| dgicY d}~S d}~ww xY w)(u  Crawl a seed URL via Firecrawl's ``/crawl`` endpoint.

        Sync SDK call wrapped in ``asyncio.to_thread`` because the dispatcher
        in :func:`tools.web_tools.web_crawl_tool` is async and runs LLM
        post-processing on the response. The dispatcher gates the seed URL
        against SSRF + website-access policy before calling us; this method
        re-checks every crawled page's URL against the policy after the
        crawl returns to catch redirected pages that map to a blocked host.

        Accepted kwargs (others ignored for forward compat):
          - ``instructions``: str — logged then dropped. Firecrawl's /crawl
            endpoint does NOT accept natural-language instructions (that's
            an /extract feature), so we record the value for debugging and
            proceed without it. Tavily's crawl IS instruction-aware; this
            divergence is documented in both plugins' docstrings.
          - ``limit``: int — max pages to crawl (default 20).
          - ``depth``: str — accepted for API parity with Tavily; ignored
            by Firecrawl's crawl endpoint.

        Returns ``{"results": [...]}`` matching the shape that
        :func:`tools.web_tools.web_crawl_tool`'s shared LLM-summarization
        path expects. Per-page failures (policy block on redirected URL,
        bad response shape) are included as items with an ``error`` field
        rather than raising.
        r   r   r   r8   r   r   instructionsr      zUFirecrawl crawl: 'instructions' parameter ignored (not supported by Firecrawl /crawl)zFirecrawl crawl: %s (limit=%d)r   r   )r   scrape_optionsr   Nr   z$Firecrawl crawl status: %s, %d pagesstatusunknownz*Firecrawl crawl: unexpected result type %rrp   r   r   rq   r   zUnknown URLr   z"Blocked crawled page %s by rule %sr   r   rZ   r   r   r   r   zFirecrawl SDK not installed: zFirecrawl crawl error: %szFirecrawl crawl failed: )r   r   r   ra   r   r   r   rk   crawlr}   r   re   r   r*   ry   r   r   r0   rp   rq   r
   r   rY   r   r   r   )r$   r   r!   r   r   r   crawl_paramscrawl_result	data_listpagesr   r   r   r   	item_dictmetadata_objpage_urlr   page_blockedr   r   s                        r   r   z FirecrawlWebSearchProvider.crawlB  s     4X	666666~ g!C"Vc$d$d#eff!::n55LJJw++E  :  
 KK8#uEEE #,zl"; L ")!2%''-" "" " "      L $&I|V,, 1=1BJL--	:L(I>>	NN   
 L$// Fl4J4J(,,VR88>B		@&&/  
 +-E! K K#' # "4.. 8 $ 1 1I'0}}Z'@'@$#,==#8#8L(}}Z<<HHT:.. 8'.tZ'F'F$#*4#>#>L#*4R#@#@L|\:: &#/#:#:#<#< z:: &#/#8#L$77 &#/#%d++ 8'+xx
';';$#'88F#3#3L#xx
B77H "(D11 &x66 &#+#6#6#8#8 :66 &#+#4#%#<<e]!C!C  !Wb11  4H== KK<$V,$V,  
 LL#+%*')+-%1)%<(4V(<(4V(<*6x*@2 2    *@l@b'!&#*'.$,     u%% 	^ 	^ 	^bRRUVYRZRZ [ [\]]]]]]] 
	 
	 
	"!##%!F!F!F	 	 	 	 	 	 	 	  	 	 	NN6<<<"!##%!AC!A!A	 	 	 	 	 	 	 		sG   O> OO> >
RP$R$R1Q=RR'Q=7R=Rc                    dddddddgdS )	Nr   u   paid · optional gatewayzQFull search + extract + crawl; supports direct API and Nous tool-gateway routing.r7   z2Firecrawl API key (or leave blank for self-hosted)z'https://docs.firecrawl.dev/introduction)keyr   r   )r   badgetagenv_varsr   r-   s    r   get_setup_schemaz+FirecrawlWebSearchProvider.get_setup_schema  s6    /-
 /RD 
 
 	
r&   Nr/   r   r(   )r   )r   r   r   r{   r   r   )r   r   r!   r   r   r   )r   r   r!   r   r   r   )r   r   )r0   r1   r2   r3   propertyr   r   r   r   r   r   r   r   r   r   r   r&   r   r   r   k  s       CC   X    X) ) ) )         R R R R R@] ] ] ]~r r r rh
 
 
 
 
 
r&   r   )r   r   )r   r5   r/   r   )r   rW   )r   r   )rn   r   r   r   )r   r   r   r   )r   r   r   r   )r   r   r   r   )(r3   
__future__r   r   loggingr>   typingr   r   r   r   r   agent.web_search_providerr	   tools.website_policyr
   	getLoggerr0   ra   r   r   FirecrawlSDKr   __annotations__r   r   rB   rI   rO   rQ   rS   rV   r[   rk   rm   r   r   r   r   r   r   r&   r   <module>r     si  + + +Z # " " " " "   				 ; ; ; ; ; ; ; ; ; ; ; ; ; ; 7 7 7 7 7 7 5 5 5 5 5 5		8	$	$  4333333'+  + + + +       $2 2 2 2 2 2 2 2 O	@ @ @ @"5 5 5 5    6 6 6 6
F F F F	 	 	 	   "5! 5! 5! 5!p	( 	( 	( 	("   .
 
 
 
   >
 
 
 
$Z
 Z
 Z
 Z
 Z
!2 Z
 Z
 Z
 Z
 Z
r&   