# huginn/tools.py

import aiohttp
import subprocess
import asyncio
from markdownify import markdownify
from google.genai import types

# MIME types whose response body open_url() reads as text and returns in the
# "content" field of its result dict ("text/html" is converted to Markdown
# first). Entries are compared against the response's bare MIME type.
SUPPORTED_TEXT_MIMETYPES = [
    # text/* types
    "text/plain",
    "text/html",
    "text/css",
    "text/csv",
    "text/javascript",
    "text/markdown",
    "text/xml",
    "text/yaml",
    "text/rtf",
    "text/x-python",
    "text/x-c",
    "text/x-java-source",
    "text/x-lua",
    "text/x-sh",
    "text/x-sass",
    "text/x-scss",
    # application/* types that are nevertheless textual
    "application/javascript",
    "application/json",
    "application/xml",
    "application/rtf",
    "application/xhtml+xml",
    "application/atom+xml",
    "application/rss+xml",
    "application/sql",
    "application/ld+json",
    "application/x-yaml",
]

# MIME types that open_url() does not decode as text: the raw response bytes
# are wrapped in a google.genai `types.Part` together with the MIME type.
SUPPORTED_IMAGE_DOCUMENT_MIMETYPES = [
    "application/pdf",
    "image/png",
    "image/apng",
    "image/jpeg"
]


async def searxng(query: str) -> list:
    """
      Search the web with SearXNG.

      Arguments:
       query (str): The search query
      Returns: a list of the first 10 search results. Each result is a dict
       with "title" and "url" keys. An empty list is returned when the
       request fails or the response cannot be interpreted.
    """
    max_results = 10
    params = {
        "q": query,
        "format": "json",
        "engines": "google,duckduckgo,brave"
    }

    # Both 'async with' blocks guarantee the session and the response are
    # released even when an error is raised while talking to the server.
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get("https://searx.xorydev.xyz/search", params=params) as response:
                # Turn 4xx / 5xx answers into an aiohttp exception.
                response.raise_for_status()
                data = await response.json()
        except aiohttp.ClientError as e:
            # Network problems, bad status codes, non-JSON content type, ...
            print(f"Error making request to SearXNG: {e}")
            return []  # Best effort: callers get an empty result list
        except Exception as e:
            # Anything else (e.g. malformed JSON) is also reported, not raised.
            print(f"An unexpected error occurred: {e}")
            return []

    # The payload comes from a remote service, so validate its shape instead
    # of letting an AttributeError/TypeError escape the best-effort contract.
    if not isinstance(data, dict):
        print("Unexpected SearXNG response: top-level JSON value is not an object")
        return []

    raw_results = data.get("results")
    if not isinstance(raw_results, list):
        return []

    results = []
    for r in raw_results:
        if not isinstance(r, dict):
            continue

        title = r.get("title")
        url = r.get("url")

        # Entries missing a title or a URL are skipped.
        if title and url:
            results.append({"title": title, "url": url})
            # Stop as soon as the cap is reached; there is no need to walk
            # the remaining entries.
            if len(results) >= max_results:
                break

    return results


async def open_url(url: str) -> dict | types.Part:
    """
    Opens a URL and returns its full content (if it's HTML, it will be converted to clean Markdown).
    Use this when a `search` result's content is insufficient or when a user provides a direct URL to analyze.

    Returns:
     - for PDF / image responses: a `types.Part` holding the raw bytes;
     - for supported text responses: a dict with "content_type",
       "content_length" and "content" (text truncated to 262144 characters);
     - for any other MIME type: the same dict with "content" set to None.
    """
    # Request headers imitating a desktop Chrome browser.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Ch-Ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "Priority": "u=0, i",
    }
    char_limit = 262144

    async with aiohttp.ClientSession(headers=browser_headers) as session:
        async with session.get(url) as response:
            # 4xx / 5xx responses propagate to the caller as an exception.
            response.raise_for_status()
            mime = response.content_type.split(";")[0].strip()
            size = response.content_length or 0

            # Binary documents/images are handed over untouched.
            if mime in SUPPORTED_IMAGE_DOCUMENT_MIMETYPES:
                payload = await response.read()
                return types.Part.from_bytes(data=payload, mime_type=mime)

            # Unknown types: report the metadata only, never read the body.
            if mime not in SUPPORTED_TEXT_MIMETYPES:
                return {
                    "content_type": mime,
                    "content_length": size,
                    "content": None,
                }

            body = await response.text()
            if mime == "text/html":
                body = markdownify(body)

            # Slicing leaves shorter texts unchanged, so no length check is needed.
            return {
                "content_type": mime,
                "content_length": size,
                "content": body[:char_limit],
            }


async def run_command(command: str) -> tuple[str, str, int]:
    """
    Runs a shell command on the host machine and captures its stdout, stderr and error code.
    Args:
     command: str
    Returns:
     tuple containing: stdout, stderr and error code (in that order).
     Both streams are decoded as UTF-8 (undecodable bytes become U+FFFD)
     and stripped of leading/trailing whitespace.
    Raises:
     TypeError: if the process reports no return code after finishing.
    """
    # SECURITY NOTE: `command` is passed to the system shell verbatim, so the
    # caller is responsible for only supplying trusted input.
    process = await asyncio.create_subprocess_shell(
        command,
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )

    # communicate() drains both pipes and waits for the process to exit.
    stdout_data, stderr_data = await process.communicate()

    # Commands may emit arbitrary bytes; replacing invalid sequences keeps the
    # rest of the output instead of failing with UnicodeDecodeError.
    stdout = stdout_data.decode(errors="replace").strip()
    stderr = stderr_data.decode(errors="replace").strip()

    return_code = process.returncode
    if return_code is None:
        # Kept as TypeError for backward compatibility with existing callers.
        raise TypeError("process finished without a return code")
    return stdout, stderr, return_code