Spaces:

FractalAIResearch
/

Fathom-DeepResearch

Running

App Files Files Community

Fathom-DeepResearch / web_agents_5 /fetchers /github_fetcher.py

Tasmay-Tib

init

5ab87e0 2 months ago

raw

history blame contribute delete

2.84 kB

	from __future__ import annotations
	from config import CFG, _SESS, _RND
	import logging
	import re
	from bs4 import BeautifulSoup
	import functools
	import random
	import requests
	import trafilatura
	import time
	from web_helpers import retry, fetch_blocked_site # ⬅️ shared


	def fetch_github(url: str) -> str:
	    """Fetch the plain-text content behind a GitHub URL.

	    Strategies are tried in order; the first one that yields text wins:

	    1. Repository root URL -> README through the GitHub REST API (raw media
	       type), with fenced code blocks and heading markers stripped.
	    2. ``/blob/`` or ``/tree/`` URL -> the equivalent
	       raw.githubusercontent.com URL (query string removed).
	    3. The original URL with ``raw=1`` appended.
	    4. The original URL with ``plain=1`` appended, reading the first
	       ``<pre>`` element of the returned HTML.
	    5. A raw.githubusercontent.com URL fetched directly (query string
	       removed); the text is prefixed with a provenance marker.

	    Retrieved text is truncated to ``CFG.text_cap`` characters.

	    Args:
	        url: The GitHub (or raw.githubusercontent.com) URL to fetch.

	    Returns:
	        The retrieved text. If every strategy fails, or any exception is
	        raised along the way, the failure is logged and the result of
	        ``fetch_blocked_site(url)`` is returned instead.
	    """
	    # Local import: the module's import block does not bring ``os`` into
	    # scope, and the token lookup below needs it.
	    import os

	    def _markdown_cleanup(md: str) -> str:
	        """Strip fenced code blocks and heading markers; squeeze spaces."""
	        md = re.sub(r"```.*?```", "", md, flags=re.S)
	        md = re.sub(r"^#+\s*", "", md, flags=re.M)
	        return re.sub(r"[ \t]{2,}", " ", md).strip()

	    def _is_text(resp) -> bool:
	        """True when the response succeeded and declares a text content type."""
	        return bool(resp.ok and "text" in (resp.headers.get("content-type") or ""))

	    hdr = {"User-Agent": "ii-research-bot/0.6"}
	    try:
	        timeout = (CFG.connect_to, CFG.read_to)

	        # 1) Repository root: ask the REST API for the raw README.
	        m = re.match(r"https?://github\.com/([^/]+)/([^/]+?)(?:\.git)?/?$", url)
	        if m:
	            owner, repo = m.groups()
	            api = f"https://api.github.com/repos/{owner}/{repo}/readme"
	            hdr_api = {**hdr, "Accept": "application/vnd.github.v3.raw"}
	            if (tok := os.getenv("GITHUB_TOKEN")):
	                hdr_api["Authorization"] = f"Bearer {tok}"
	            r = _SESS.get(api, headers=hdr_api, timeout=timeout)
	            # Very short bodies are treated as "no usable README".
	            if r.ok and len(r.text) > 30:
	                return _markdown_cleanup(r.text)[:CFG.text_cap]

	        # 2) blob/tree page: rewrite to the raw content host.
	        if "/blob/" in url or "/tree/" in url:
	            raw = re.sub(
	                r"https?://github\.com/([^/]+)/([^/]+)/(?:blob|tree)/",
	                r"https://raw.githubusercontent.com/\1/\2/",
	                url,
	                count=1,
	            ).split("?", 1)[0]
	            r = _SESS.get(raw, headers=hdr, timeout=timeout)
	            if _is_text(r) and len(r.text) > 0:
	                return r.text[:CFG.text_cap]

	        # 3) Same URL with the ``raw=1`` query flag.
	        raw1 = url + ("?raw=1" if "?" not in url else "&raw=1")
	        r = _SESS.get(raw1, headers=hdr, timeout=timeout)
	        if _is_text(r) and len(r.text) > 0:
	            return r.text[:CFG.text_cap]

	        # 4) Same URL with ``plain=1``; take the first <pre> element.
	        plain = url + ("?plain=1" if "?" not in url else "&plain=1")
	        html = _SESS.get(plain, headers=hdr, timeout=timeout).text
	        soup = BeautifulSoup(html, "lxml")
	        pre = soup.find("pre")
	        if pre and len(pre.text) > 10:
	            return pre.text[:CFG.text_cap]

	        # 5) Already a raw-host URL: fetch it without its query string.
	        if "raw.githubusercontent.com" in url:
	            r = _SESS.get(url.split("?", 1)[0], headers=hdr, timeout=timeout)
	            if _is_text(r):
	                return "[Retrieved from raw.githubusercontent.com]" + r.text[:CFG.text_cap]

	        raise RuntimeError("github: unable to retrieve plain text")
	    except Exception as e:
	        # Deliberate catch-all: any failure above falls back to the shared
	        # blocked-site fetcher imported from web_helpers.
	        logging.error("GitHub fetch failed for %s: %s", url, e)
	        return fetch_blocked_site(url)