# Async helpers for searching the web through SearXNG and fetching URL contents.
import aiohttp
|
|
from markdownify import markdownify
|
|
|
|
# MIME types whose bodies open_url() will download and return as text.
# Anything not listed here is reported with "content": None.
# Entries are lowercase; compare against a lowercased media type.
SUPPORTED_TEXT_MIMETYPES = [
    # Plain text, markup and data formats under text/*
    "text/plain",
    "text/html",
    "text/css",
    "text/csv",
    "text/javascript",
    "text/markdown",
    "text/xml",
    "text/yaml",
    "text/rtf",
    # Source code served under text/x-*
    "text/x-python",
    "text/x-c",
    "text/x-java-source",
    "text/x-lua",
    "text/x-sh",
    "text/x-sass",
    "text/x-scss",
    # Textual formats registered under application/*
    "application/javascript",
    "application/json",
    "application/xml",
    "application/rtf",
    "application/xhtml+xml",
    "application/atom+xml",
    "application/rss+xml",
    "application/sql",
    "application/ld+json",
    "application/x-yaml",
]
|
|
|
|
|
|
async def searxng(query: str) -> list:
    """
    Search the web with SearXNG.

    Arguments:
        query (str): The search query

    Returns: a list of at most 10 search results, each a dict with "title"
    and "url" keys. An empty list is returned if the request fails or the
    response does not have the expected shape.
    """
    max_results = 10

    params = {
        "q": query,
        "format": "json",
        "engines": "google,duckduckgo,brave",
    }

    # The 'async with' blocks close the session and the response even when
    # an exception is raised inside them.
    async with aiohttp.ClientSession() as session:
        try:
            async with session.get("https://searx.xorydev.xyz/search", params=params) as response:
                # Raise an exception for bad status codes (4xx or 5xx)
                response.raise_for_status()

                # Await the JSON parsing of the response body
                data = await response.json()
        except aiohttp.ClientError as e:
            # Catch any aiohttp-related errors (network issues, invalid URL, etc.)
            print(f"Error making request to SearXNG: {e}")
            return []  # Return an empty list on error
        except Exception as e:
            # Catch any other unexpected errors
            print(f"An unexpected error occurred: {e}")
            return []

    # The payload is parsed outside the try block above, so validate its shape
    # here instead of letting an AttributeError/TypeError escape to the caller.
    entries = data.get("results") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        print("Unexpected response format from SearXNG")
        return []

    results = []
    for entry in entries:
        if not isinstance(entry, dict):
            continue

        title = entry.get("title")
        url = entry.get("url")

        # Only keep results that have both a title and a URL
        if title and url:
            results.append({"title": title, "url": url})
            if len(results) == max_results:
                # Enough results collected; no need to scan the rest
                break

    return results
|
|
|
|
|
|
async def open_url(url: str) -> dict:
    """
    Opens a URL and returns its full content (if it's HTML, it will be converted to clean Markdown).
    Use this when a `search` result's content is insufficient or when a user provides a direct URL to analyze.

    Arguments:
        url (str): The URL to open

    Returns: a dict with three keys:
        "content_type": the response MIME type, lowercased, without parameters
        "content_length": the Content-Length reported by the server, or 0 if none was reported
        "content": the response text truncated to 262144 characters, or None
            if the MIME type is not in SUPPORTED_TEXT_MIMETYPES

    Raises:
        aiohttp.ClientResponseError: if the server answers with a 4xx or 5xx status.
        Other aiohttp client errors raised while making the request are not
        caught here and propagate to the caller.
    """
    # Upper bound on the number of characters returned in "content".
    # NOTE: the whole body is still downloaded and decoded before truncation.
    max_chars = 262144

    # Headers copied from a desktop Chrome request (see the User-Agent);
    # presumably so sites serve the same page they would serve a browser.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/138.0.0.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate",
        "Accept-Language": "en-US,en;q=0.9",
        "Sec-Ch-Ua": '"Not)A;Brand";v="8", "Chromium";v="138", "Google Chrome";v="138"',
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": '"Windows"',
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "Priority": "u=0, i",
    }

    async with aiohttp.ClientSession(headers=browser_headers) as session:
        async with session.get(url) as response:
            response.raise_for_status()

            # Media types are case-insensitive, while the allow-list is
            # lowercase, so normalise before comparing.
            content_type = response.content_type.split(";")[0].strip().lower()
            content_length = response.content_length or 0

            content = None
            if content_type in SUPPORTED_TEXT_MIMETYPES:
                # Replace undecodable bytes instead of failing the whole
                # fetch on a page with a wrong or partially invalid charset.
                content = await response.text(errors="replace")
                if content_type == "text/html":
                    content = markdownify(content)
                # Slicing is a no-op when the text is already short enough
                content = content[:max_chars]

    return {
        "content_type": content_type,
        "content_length": content_length,
        "content": content,
    }
|