// The-Agency/lib/src/tools/web_fetch_tool.dart
//
// 863 lines
// 24 KiB
// Dart

import "dart:convert";
import "dart:io";
import "dart:math" show Random;
import "dart:typed_data";

import "package:html/dom.dart" as dom;
import "package:html/parser.dart" as html_parser;
import "package:path/path.dart" as path;

import "../api/openrouter_client.dart";
import "base_tool.dart";
/// Longest URL string, in characters, that the tool accepts.
const _maxUrlLength = 2000;

/// Largest response body, in bytes, that will be read (10 MiB).
const _maxFetchBytes = 10 << 20;

/// Maximum number of characters of page content kept for the prompt.
const _maxPromptContentChars = 100000;

/// Maximum number of same-host redirects followed for one fetch.
const _maxRedirects = 10;

/// How long a cached fetch result stays valid.
const _cacheTtl = Duration(minutes: 15);

/// Maximum number of fetch results kept in the cache.
const _maxCacheEntries = 64;
/// Cache keys ordered from least to most recently used.
final _cacheOrder = <_CacheKey>[];

/// Cached fetch results, keyed by the requested URL.
final _cache = <_CacheKey, _CacheEntry>{};
/// Hosts whose pages are treated as pre-approved, whatever the path.
///
/// Entries are compared against the lower-cased host of the requested URL.
const _preapprovedHosts = <String>{
  "platform.claude.com",
  "code.claude.com",
  "modelcontextprotocol.io",
  "docs.python.org",
  "en.cppreference.com",
  "docs.oracle.com",
  "learn.microsoft.com",
  "developer.mozilla.org",
  "go.dev",
  "pkg.go.dev",
  "www.php.net",
  "docs.swift.org",
  "kotlinlang.org",
  "ruby-doc.org",
  "doc.rust-lang.org",
  "www.typescriptlang.org",
  "react.dev",
  "angular.io",
  "vuejs.org",
  "nextjs.org",
  "expressjs.com",
  "nodejs.org",
  "bun.sh",
  "getbootstrap.com",
  "tailwindcss.com",
  "redux.js.org",
  "webpack.js.org",
  "jestjs.io",
  "reactrouter.com",
  "docs.djangoproject.com",
  "flask.palletsprojects.com",
  "fastapi.tiangolo.com",
  "pandas.pydata.org",
  "numpy.org",
  "www.tensorflow.org",
  "pytorch.org",
  "scikit-learn.org",
  "matplotlib.org",
  "requests.readthedocs.io",
  "jupyter.org",
  "laravel.com",
  "symfony.com",
  "wordpress.org",
  "docs.spring.io",
  "hibernate.org",
  "tomcat.apache.org",
  "gradle.org",
  "maven.apache.org",
  "asp.net",
  "dotnet.microsoft.com",
  "nuget.org",
  "reactnative.dev",
  "docs.flutter.dev",
  "developer.apple.com",
  "developer.android.com",
  "keras.io",
  "spark.apache.org",
  "huggingface.co",
  "www.kaggle.com",
  "www.mongodb.com",
  "redis.io",
  "www.postgresql.org",
  "dev.mysql.com",
  "www.sqlite.org",
  "graphql.org",
  "prisma.io",
  "docs.aws.amazon.com",
  "cloud.google.com",
  "kubernetes.io",
  "www.docker.com",
  "www.terraform.io",
  "www.ansible.com",
  "docs.netlify.com",
  "devcenter.heroku.com",
  "cypress.io",
  "selenium.dev",
  "docs.unity.com",
  "docs.unrealengine.com",
  "git-scm.com",
  "nginx.org",
  "httpd.apache.org",
};
/// Hosts that are pre-approved only below specific path prefixes.
///
/// Maps a lower-cased host to the path prefixes that count as pre-approved.
const _preapprovedPathPrefixes = <String, List<String>>{
  "github.com": ["/anthropics"],
  "vercel.com": ["/docs"],
};
/// Tool that downloads a web page, converts it to markdown-like text and
/// answers a prompt about it.
///
/// Fetch results are cached in library-level state for a limited time, and
/// URLs are checked against permission rules before any request is made.
class WebFetchTool extends BaseTool {
/// The tool's name, `"WebFetch"`.
@override
final String name = "WebFetch";
/// Human-readable summary of what the tool does.
@override
final String description =
"Fetch content from a URL, convert it to readable markdown-like text, and answer a prompt about that page.";
/// Fetches `input["url"]` and answers `input["prompt"]` about its content.
///
/// Recognised input keys:
/// - `url` and `prompt` (required strings),
/// - `_api_key`, `_model`, `_permission_mode` (optional strings; the model
///   defaults to `"openrouter/auto"`, the mode to `"default"`),
/// - `_allow_rules`, `_ask_rules`, `_deny_rules` (optional string lists).
///
/// Returns a text report with the final URL, HTTP status, byte count, fetch
/// duration and the answer. When the page redirects to another host, the
/// redirect notice is returned instead and the model is not called.
///
/// Throws [ArgumentError] when the prompt is empty or the URL is rejected,
/// and [StateError] when no API key is supplied or a permission rule blocks
/// the URL.
@override
Future<String> execute(Map<String, dynamic> input) async {
  // NOTE(review): requireString/optionalString are not defined in this file;
  // presumably inherited from BaseTool.
  final rawUrl = requireString(input, "url").trim();
  final prompt = requireString(input, "prompt").trim();
  final apiKey = optionalString(input, "_api_key") ?? "";
  final model = optionalString(input, "_model") ?? "openrouter/auto";
  final permissionMode = optionalString(input, "_permission_mode") ?? "default";
  final allowRules = _readStringList(input["_allow_rules"]);
  final askRules = _readStringList(input["_ask_rules"]);
  final denyRules = _readStringList(input["_deny_rules"]);
  if (prompt.isEmpty) {
    throw ArgumentError("prompt must not be empty");
  }
  if (apiKey.isEmpty) {
    throw StateError("WebFetch requires an OpenRouter API key");
  }
  final url = _normalizeUrl(rawUrl);
  final uri = Uri.parse(url);
  _enforcePermissions(
    uri: uri,
    permissionMode: permissionMode,
    allowRules: allowRules,
    askRules: askRules,
    denyRules: denyRules,
  );
  final isPreapproved = _isPreapprovedUrl(url);
  // A Stopwatch is monotonic; subtracting two DateTime.now() readings can be
  // wrong (even negative) when the system clock is adjusted mid-fetch.
  final stopwatch = Stopwatch()..start();
  final fetched = await _fetchUrl(url);
  stopwatch.stop();
  final durationMs = stopwatch.elapsedMilliseconds;
  if (fetched.isRedirectNotice) {
    return _formatOutput(
      fetched: fetched,
      durationMs: durationMs,
      result: fetched.content,
    );
  }
  final result = await _summarizeFetchedContent(
    apiKey: apiKey,
    model: model,
    fetched: fetched,
    prompt: prompt,
    isPreapproved: isPreapproved,
  );
  return _formatOutput(
    fetched: fetched,
    durationMs: durationMs,
    result: result,
  );
}
/// Validates [rawUrl] and returns it as a normalized `https` URL string.
///
/// `http` URLs are upgraded to `https`. Throws [ArgumentError] when the URL
/// is empty, too long, unparsable, has no scheme, carries user info, has no
/// host, uses any other scheme, or points at a local or private host.
String _normalizeUrl(String rawUrl) {
  final hasAcceptableLength =
      rawUrl.isNotEmpty && rawUrl.length <= _maxUrlLength;
  if (!hasAcceptableLength) {
    throw ArgumentError("Invalid URL");
  }

  final parsed = Uri.tryParse(rawUrl);
  if (parsed == null) {
    throw ArgumentError("Invalid URL: $rawUrl");
  }
  if (!parsed.hasScheme) {
    throw ArgumentError("URL must include a scheme");
  }
  if (parsed.userInfo.isNotEmpty || parsed.host.isEmpty) {
    throw ArgumentError("Invalid URL");
  }

  // Plain http is silently upgraded; every other scheme is refused.
  final secured =
      parsed.scheme == "http" ? parsed.replace(scheme: "https") : parsed;
  if (secured.scheme != "https") {
    throw ArgumentError("Only https URLs are supported");
  }

  if (_isLocalOrPrivateHost(secured.host.toLowerCase())) {
    throw ArgumentError("Fetching local or private network URLs is not allowed");
  }
  return secured.toString();
}
/// Applies the deny, allow and ask rules to [uri], in that order.
///
/// Returns without checking anything when [permissionMode] is
/// `"bypassPermissions"` or `"dontAsk"`. Otherwise a matching deny rule
/// throws [StateError], a matching allow rule permits the request, and a
/// matching ask rule throws [StateError] asking for an allow rule. A URL that
/// matches no rule at all is permitted.
void _enforcePermissions({
  required Uri uri,
  required String permissionMode,
  required List<String> allowRules,
  required List<String> askRules,
  required List<String> denyRules,
}) {
  if (permissionMode == "bypassPermissions" || permissionMode == "dontAsk") {
    return;
  }

  bool appliesToUri(String rule) => _matchesRule(rule, uri);
  final domainRule = "domain:${uri.host.toLowerCase()}";

  if (denyRules.any(appliesToUri)) {
    throw StateError("WebFetch denied access to $domainRule.");
  }
  if (allowRules.any(appliesToUri)) {
    return;
  }
  if (askRules.any(appliesToUri)) {
    throw StateError(
      "WebFetch requires permission for $domainRule. Add an allow rule to proceed.",
    );
  }
}
/// Whether the permission rule [rawRule] applies to the host of [uri].
///
/// A rule is `domain:<pattern>`, optionally wrapped as `WebFetch(...)`.
/// The pattern is compared case-insensitively and may be an exact host, a
/// `*.suffix` wildcard or a `prefix.*` wildcard. Anything else never matches.
bool _matchesRule(String rawRule, Uri uri) {
  const wrapperOpen = "WebFetch(";
  const domainTag = "domain:";

  var body = rawRule.trim();
  if (body.startsWith(wrapperOpen) && body.endsWith(")")) {
    body = body.substring(wrapperOpen.length, body.length - 1);
  }
  if (!body.startsWith(domainTag)) {
    return false;
  }

  final wanted = body.substring(domainTag.length).toLowerCase();
  if (wanted.isEmpty) {
    return false;
  }

  final actual = uri.host.toLowerCase();
  if (wanted == actual) {
    return true;
  }
  if (wanted.startsWith("*.")) {
    // Keep the leading dot so "*.a.com" does not match "nota.com".
    return actual.endsWith(wanted.substring(1));
  }
  if (wanted.endsWith(".*")) {
    // Keep the trailing dot so "a.*" does not match "ab.com".
    return actual.startsWith(wanted.substring(0, wanted.length - 1));
  }
  return false;
}
/// Downloads [originalUrl] and returns its readable content.
///
/// A fresh cache entry for [originalUrl] is returned without any network
/// access. Otherwise the URL is requested with GET and redirects are handled
/// here, one hop at a time:
/// - a redirect to a different host (ignoring a leading `www.`) is not
///   followed; a redirect notice is returned and cached instead;
/// - a same-host redirect is followed, up to [_maxRedirects] times.
///
/// Throws [StateError] on too many redirects or a redirect to a non-https
/// scheme; timeouts and I/O errors from `dart:io` propagate to the caller.
Future<_FetchedContent> _fetchUrl(String originalUrl) async {
  const requestTimeout = Duration(seconds: 60);
  _pruneCache();
  final cacheKey = _CacheKey(originalUrl);
  final cached = _cache[cacheKey];
  if (cached != null && DateTime.now().difference(cached.fetchedAt) < _cacheTtl) {
    _touchCacheEntry(cacheKey);
    return cached.content;
  }
  final httpClient = HttpClient()..connectionTimeout = requestTimeout;
  try {
    var currentUrl = Uri.parse(originalUrl);
    final originalComparableHost = _stripWww(currentUrl.host);
    for (var redirectCount = 0; redirectCount <= _maxRedirects; redirectCount++) {
      final request = await httpClient.getUrl(currentUrl);
      // dart:io follows redirects by itself unless told otherwise, which
      // would bypass the host check and redirect limit enforced below.
      request.followRedirects = false;
      request.headers.set("Accept", "text/markdown, text/html, text/plain, */*");
      request.headers.set("User-Agent", "clawd_code/0.1.0 (WebFetch)");
      final response = await request.close().timeout(requestTimeout);
      final statusCode = response.statusCode;
      final statusText = response.reasonPhrase;
      final location = response.headers.value(HttpHeaders.locationHeader);
      // 303 is checked explicitly: with automatic following disabled it has
      // to be handled here like the other redirect statuses.
      final isRedirect =
          _isRedirect(statusCode) || statusCode == HttpStatus.seeOther;
      if (isRedirect && location != null) {
        // Discard the redirect body so the connection is released.
        await response.drain<void>().timeout(requestTimeout);
        var redirectUrl = currentUrl.resolve(location);
        if (_stripWww(redirectUrl.host) != originalComparableHost) {
          final redirectNotice = _FetchedContent(
            finalUrl: currentUrl.toString(),
            statusCode: statusCode,
            reasonPhrase: statusText,
            bytes: 0,
            contentType: "text/plain",
            content:
                "REDIRECT DETECTED: The URL redirects to a different host.\n\n"
                "Original URL: $currentUrl\n"
                "Redirect URL: $redirectUrl\n"
                "Status: $statusCode $statusText\n\n"
                "To complete your request, use WebFetch again with these parameters:\n"
                "- url: \"$redirectUrl\"",
            isRedirectNotice: true,
          );
          _storeCacheEntry(cacheKey, redirectNotice);
          return redirectNotice;
        }
        // Same rule as for the requested URL: http is upgraded, everything
        // else that is not https is refused.
        if (redirectUrl.scheme == "http") {
          redirectUrl = redirectUrl.replace(scheme: "https");
        }
        if (redirectUrl.scheme != "https") {
          throw StateError("Redirect to unsupported URL scheme: $redirectUrl");
        }
        currentUrl = redirectUrl;
        continue;
      }
      final bytes = await _readResponseBytes(response);
      final contentType =
          response.headers.contentType?.mimeType ?? "application/octet-stream";
      final isBinary = _looksBinary(contentType, bytes);
      final persistedBinary = isBinary
          ? await _persistBinaryContent(bytes, contentType)
          : null;
      final decodedText = _decodeBody(bytes, isBinary: isBinary);
      final readableContent = _extractReadableContent(
        decodedText,
        contentType: contentType,
        url: currentUrl.toString(),
      );
      final fetched = _FetchedContent(
        finalUrl: currentUrl.toString(),
        statusCode: statusCode,
        reasonPhrase: statusText,
        bytes: bytes.length,
        contentType: contentType,
        content: readableContent,
        persistedBinaryPath: persistedBinary?.path,
        persistedBinarySize: persistedBinary?.size,
      );
      _storeCacheEntry(cacheKey, fetched);
      return fetched;
    }
    throw StateError("Too many redirects");
  } finally {
    // Force-close so an aborted or oversized response cannot keep a socket
    // open; on success the body has already been read completely.
    httpClient.close(force: true);
  }
}
/// Reads the whole body of [response] into memory.
///
/// Throws [StateError] as soon as more than [_maxFetchBytes] bytes have been
/// received, so an oversized body is never buffered completely.
Future<List<int>> _readResponseBytes(HttpClientResponse response) async {
  final collected = BytesBuilder(copy: false);
  await for (final piece in response) {
    collected.add(piece);
    final isOverLimit = collected.length > _maxFetchBytes;
    if (isOverLimit) {
      throw StateError("Response exceeded ${_maxFetchBytes} bytes");
    }
  }
  return collected.takeBytes();
}
/// Converts the raw body [bytes] to a string.
///
/// Text bodies are decoded as UTF-8, falling back to Latin-1 when that
/// fails. Binary bodies ([isBinary] true) are always decoded as Latin-1.
String _decodeBody(List<int> bytes, {required bool isBinary}) {
  String asLatin1() => latin1.decode(bytes, allowInvalid: true);

  if (!isBinary) {
    try {
      return utf8.decode(bytes);
    } catch (_) {
      // Not valid UTF-8: fall through to the lossless byte-per-char decode.
      return asLatin1();
    }
  }
  return asLatin1();
}
/// Saves [bytes] to a new file in the system temp directory.
///
/// The file is named `webfetch-<millis>-<suffix><extension>`, with the
/// extension chosen from [contentType]. This is best effort: any failure is
/// swallowed and reported as `null`.
Future<_PersistedBinary?> _persistBinaryContent(
  List<int> bytes,
  String contentType,
) async {
  try {
    final extension = _extensionForMimeType(contentType);
    final stamp = DateTime.now().millisecondsSinceEpoch;
    final suffix = _randomSuffix();
    final target = File(
      path.join(Directory.systemTemp.path, "webfetch-$stamp-$suffix$extension"),
    );
    await target.writeAsBytes(bytes, flush: true);
    return _PersistedBinary(path: target.path, size: bytes.length);
  } catch (_) {
    return null;
  }
}
/// Turns a fetched body into markdown-like text, truncated for prompting.
///
/// Bodies whose [contentType] mentions `markdown` or `plain` are returned
/// trimmed and truncated. Everything else is parsed as HTML: scripts,
/// styles and similar elements are removed, then the title, the meta
/// description and the main content are rendered as text.
///
/// Throws [StateError] when no readable content is found at [url].
String _extractReadableContent(
  String rawContent, {
  required String contentType,
  required String url,
}) {
  if (contentType.contains("markdown") || contentType.contains("plain")) {
    return _truncateContent(rawContent.trim());
  }
  final document = html_parser.parse(rawContent);
  // querySelectorAll returns a snapshot, so removing nodes while iterating
  // over it is safe.
  for (final node
      in document.querySelectorAll("script,style,noscript,svg,iframe")) {
    node.remove();
  }
  final title = document.querySelector("title")?.text.trim();
  final description = document
      .querySelector('meta[name="description"], meta[property="og:description"]')
      ?.attributes["content"]
      ?.trim();
  // Prefer the most specific content container the page offers.
  final root =
      document.querySelector("article") ??
      document.querySelector("main") ??
      document.body ??
      document.documentElement;
  if (root == null) {
    throw StateError("No readable content found at $url");
  }
  final buffer = StringBuffer();
  if (title != null && title.isNotEmpty) {
    buffer.writeln("# $title");
    buffer.writeln();
  }
  if (description != null && description.isNotEmpty) {
    buffer.writeln(description);
    buffer.writeln();
  }
  for (final node in root.nodes) {
    _writeNode(node, buffer, listDepth: 0, inPre: false);
  }
  // The HTML parser has already decoded every character reference, so the
  // text must NOT go through _decodeHtmlEntities again: a second pass would
  // turn text that literally reads "&lt;" (for example in documentation
  // about escaping) into "<".
  final result = buffer.toString().replaceAll(RegExp(r"\n{3,}"), "\n\n").trim();
  if (result.isEmpty) {
    throw StateError("No readable content found at $url");
  }
  return _truncateContent(result);
}
/// Renders [node] and its descendants into [buffer] as markdown-like text.
///
/// Text nodes are written with whitespace runs collapsed to one space
/// (verbatim when [inPre] is true); whitespace-only text is dropped.
/// Headings, paragraphs, line breaks, code, lists, links, block quotes,
/// tables and rules get dedicated markup; any other element just renders
/// its children, surrounded by line breaks for common container tags.
/// [listDepth] is the nesting level used to indent list items.
void _writeNode(
  dom.Node node,
  StringBuffer buffer, {
  required int listDepth,
  required bool inPre,
}) {
  if (node is dom.Text) {
    final rendered =
        inPre ? node.text : node.text.replaceAll(RegExp(r"\s+"), " ");
    if (rendered.trim().isNotEmpty) {
      buffer.write(rendered);
    }
    return;
  }
  if (node is! dom.Element) {
    return;
  }

  final tag = node.localName?.toLowerCase() ?? "";
  switch (tag) {
    case "h1":
    case "h2":
    case "h3":
    case "h4":
    case "h5":
    case "h6":
      // The digit after "h" is the heading level.
      final hashes = "#" * (int.tryParse(tag.substring(1)) ?? 1);
      buffer.write("\n$hashes ${node.text.trim()}\n\n");
      return;
    case "p":
      _writeChildren(node, buffer, listDepth: listDepth, inPre: false);
      buffer.write("\n\n");
      return;
    case "br":
      buffer.write("\n");
      return;
    case "pre":
      final block = node.text.trimRight();
      if (block.isNotEmpty) {
        buffer.write("\n```\n$block\n```\n\n");
      }
      return;
    case "code":
      final inline = node.text.replaceAll(RegExp(r"\s+"), " ").trim();
      if (inline.isNotEmpty) {
        buffer.write("`$inline`");
      }
      return;
    case "ul":
    case "ol":
      final isOrdered = tag == "ol";
      final indent = " " * listDepth;
      buffer.write("\n");
      var position = 0;
      for (final item in node.children) {
        if (item.localName != "li") {
          continue;
        }
        position += 1;
        final marker = isOrdered ? "$position." : "-";
        buffer.write("$indent$marker ");
        _writeChildren(item, buffer, listDepth: listDepth + 1, inPre: false);
        buffer.write("\n");
      }
      buffer.write("\n");
      return;
    case "li":
      _writeChildren(node, buffer, listDepth: listDepth, inPre: false);
      return;
    case "a":
      final label = node.text.replaceAll(RegExp(r"\s+"), " ").trim();
      final target = node.attributes["href"]?.trim();
      final isUsableLink =
          label.isNotEmpty && target != null && target.isNotEmpty;
      if (isUsableLink) {
        buffer.write("[$label]($target)");
      } else {
        _writeChildren(node, buffer, listDepth: listDepth, inPre: false);
      }
      return;
    case "blockquote":
      final quoted = node.text.trim();
      if (quoted.isNotEmpty) {
        // Prefix every line of the quote, not just the first.
        buffer.write("\n> ${quoted.replaceAll("\n", "\n> ")}\n\n");
      }
      return;
    case "table":
      final flattened = node.text.replaceAll(RegExp(r"\s+"), " ").trim();
      if (flattened.isNotEmpty) {
        buffer.write("\n$flattened\n\n");
      }
      return;
    case "hr":
      buffer.write("\n---\n\n");
      return;
    default:
      // Container elements are set off by line breaks; everything else is
      // rendered inline.
      const containerTags = <String>{
        "article",
        "section",
        "main",
        "div",
        "header",
        "footer",
        "nav",
        "aside",
      };
      final isContainer = containerTags.contains(tag);
      if (isContainer) {
        buffer.write("\n");
      }
      _writeChildren(node, buffer, listDepth: listDepth, inPre: inPre);
      if (isContainer) {
        buffer.write("\n");
      }
  }
}
/// Renders every child node of [element] into [buffer], in document order.
///
/// A whitespace-only text node that sits between two inline siblings is
/// written as a single space, so `<a>one</a> <a>two</a>` does not collapse
/// into glued-together output. Whitespace next to line-breaking elements, or
/// at either end of [element], is dropped. When [inPre] is true every child
/// is passed through unchanged.
void _writeChildren(
  dom.Element element,
  StringBuffer buffer, {
  required int listDepth,
  required bool inPre,
}) {
  final children = element.nodes;
  for (var i = 0; i < children.length; i++) {
    final child = children[i];
    final isBlankText = !inPre &&
        child is dom.Text &&
        child.text.isNotEmpty &&
        child.text.trim().isEmpty;
    if (isBlankText) {
      final separatesInlineContent = i > 0 &&
          i + 1 < children.length &&
          _isInlineSibling(children[i - 1]) &&
          _isInlineSibling(children[i + 1]);
      if (separatesInlineContent) {
        buffer.write(" ");
      }
      continue;
    }
    _writeNode(child, buffer, listDepth: listDepth, inPre: inPre);
  }
}

/// Whether [sibling] is rendered inline, i.e. without surrounding line breaks.
///
/// Non-blank text counts as inline; elements count as inline unless their tag
/// is one that `_writeNode` renders with line breaks. Comments and other node
/// types never do.
bool _isInlineSibling(dom.Node sibling) {
  if (sibling is dom.Text) {
    return sibling.text.trim().isNotEmpty;
  }
  if (sibling is! dom.Element) {
    return false;
  }
  const lineBreakingTags = <String>{
    "h1",
    "h2",
    "h3",
    "h4",
    "h5",
    "h6",
    "p",
    "br",
    "pre",
    "ul",
    "ol",
    "blockquote",
    "table",
    "hr",
    "article",
    "section",
    "main",
    "div",
    "header",
    "footer",
    "nav",
    "aside",
  };
  return !lineBreakingTags.contains(sibling.localName?.toLowerCase() ?? "");
}
/// Answers [prompt] about the [fetched] page using the model [model].
///
/// Pre-approved markdown pages that fit within [_maxPromptContentChars] are
/// returned verbatim without calling the model. Otherwise the page content
/// and the prompt are sent to the model and the text blocks of its reply are
/// joined with newlines. Returns `"No response from model."` when the reply
/// carries no text. The client is always closed afterwards.
Future<String> _summarizeFetchedContent({
  required String apiKey,
  required String model,
  required _FetchedContent fetched,
  required String prompt,
  required bool isPreapproved,
}) async {
  final canReturnVerbatim = isPreapproved &&
      fetched.contentType.contains("markdown") &&
      fetched.content.length <= _maxPromptContentChars;
  if (canReturnVerbatim) {
    return fetched.content;
  }

  final client = await OpenRouterClientFactory.create(apiKey: apiKey);
  try {
    // Pre-approved sources get a more permissive instruction.
    final String systemInstruction;
    if (isPreapproved) {
      systemInstruction =
          "Provide a concise response based on the fetched content. Include relevant details and code examples when present.";
    } else {
      systemInstruction =
          "Provide a concise response based only on the fetched content. Use short quotes only when necessary.";
    }
    final userMessage = "URL: ${fetched.finalUrl}\n"
        "Content-Type: ${fetched.contentType}\n\n"
        "Web page content:\n---\n${fetched.content}\n---\n\n"
        "$prompt";

    final response = await client.createMessage(
      model: model,
      maxTokens: 2048,
      messages: <Map<String, dynamic>>[
        <String, dynamic>{"role": "system", "content": systemInstruction},
        <String, dynamic>{"role": "user", "content": userMessage},
      ],
    );

    // Keep only the non-empty text of blocks tagged as "text".
    final textParts = <String>[];
    for (final block in response.content) {
      if (block is! Map<String, dynamic> || block["type"] != "text") {
        continue;
      }
      final text = block["text"];
      if (text is String && text.isNotEmpty) {
        textParts.add(text);
      }
    }

    final answer = textParts.join("\n").trim();
    if (answer.isEmpty) {
      return "No response from model.";
    }
    return answer;
  } finally {
    client.close();
  }
}
/// Builds the text report returned to the caller.
///
/// The report lists the final URL, HTTP status, byte count and duration,
/// then (when binary content was saved) where it was saved, then a blank
/// line followed by the trimmed [result].
String _formatOutput({
  required _FetchedContent fetched,
  required int durationMs,
  required String result,
}) {
  final savedPath = fetched.persistedBinaryPath;
  final savedSize = fetched.persistedBinarySize;
  return <String>[
    "URL: ${fetched.finalUrl}",
    "Status: ${fetched.statusCode} ${fetched.reasonPhrase}",
    "Bytes: ${fetched.bytes}",
    "Duration: ${_formatDuration(durationMs)}",
    if (savedPath != null && savedSize != null)
      "Binary content saved: $savedPath ($savedSize bytes)",
    "",
    result.trim(),
  ].join("\n");
}
/// Drops every cache entry that is missing or at least [_cacheTtl] old.
///
/// Both the ordering list and the entry map are updated.
void _pruneCache() {
  final now = DateTime.now();
  _cacheOrder.removeWhere((key) {
    final storedAt = _cache[key]?.fetchedAt;
    final isFresh = storedAt != null && now.difference(storedAt) < _cacheTtl;
    if (isFresh) {
      return false;
    }
    _cache.remove(key);
    return true;
  });
}
/// Caches [content] under [key], stamped with the current time.
///
/// The key becomes the most recently used one; the least recently used
/// entries are evicted while more than [_maxCacheEntries] are held.
void _storeCacheEntry(_CacheKey key, _FetchedContent content) {
  final entry = _CacheEntry(content: content, fetchedAt: DateTime.now());
  _cache[key] = entry;
  _touchCacheEntry(key);
  while (_cacheOrder.length > _maxCacheEntries) {
    // The head of the list is the least recently used key.
    _cache.remove(_cacheOrder.removeAt(0));
  }
}
/// Marks [key] as the most recently used cache key.
void _touchCacheEntry(_CacheKey key) {
  // Move the key to the tail of the list, adding it if it was absent.
  _cacheOrder
    ..remove(key)
    ..add(key);
}
/// Whether [url] points at a pre-approved host or host/path combination.
///
/// A host listed in [_preapprovedHosts] is approved for every path. A host
/// listed in [_preapprovedPathPrefixes] is approved only when the path equals
/// one of its prefixes or continues it after a `/`.
bool _isPreapprovedUrl(String url) {
  final parsed = Uri.parse(url);
  final hostName = parsed.host.toLowerCase();
  if (_preapprovedHosts.contains(hostName)) {
    return true;
  }

  final allowedPrefixes = _preapprovedPathPrefixes[hostName] ?? const <String>[];
  for (final prefix in allowedPrefixes) {
    if (parsed.path == prefix || parsed.path.startsWith("$prefix/")) {
      return true;
    }
  }
  return false;
}
/// Whether [host] names this machine or a non-public network address.
///
/// Returns true for:
/// - empty hosts and single-label names such as `localhost` or `intranet`,
/// - names ending in `.local` or `.localhost`,
/// - every IPv6 literal (including IPv4-mapped ones like
///   `::ffff:127.0.0.1`); these are rejected wholesale,
/// - dotted IPv4 literals that are malformed or fall in the "this network",
///   private, shared (CGNAT), loopback, link-local, multicast or reserved
///   ranges.
///
/// Trailing dots (`localhost.`) and surrounding brackets are ignored.
/// Only the literal host is inspected: a public name that resolves to a
/// private address is not detected here.
bool _isLocalOrPrivateHost(String host) {
  var lower = host.toLowerCase();
  if (lower.startsWith("[") && lower.endsWith("]")) {
    lower = lower.substring(1, lower.length - 1);
  }
  // "example.com." and "example.com" name the same host.
  while (lower.endsWith(".")) {
    lower = lower.substring(0, lower.length - 1);
  }
  // A colon can only appear in an IPv6 literal. Checking it here, instead of
  // testing for an "fc"/"fd" prefix, keeps public names such as "fda.gov"
  // from being mistaken for unique-local IPv6 addresses.
  if (lower.contains(":")) {
    return true;
  }
  if (lower == "localhost" || !lower.contains(".")) {
    return true;
  }
  if (lower.endsWith(".local") || lower.endsWith(".localhost")) {
    return true;
  }
  final ipv4 = RegExp(r"^(\d{1,3}\.){3}\d{1,3}$");
  if (ipv4.hasMatch(lower)) {
    final parts = lower.split(".").map(int.parse).toList();
    if (parts.any((part) => part < 0 || part > 255)) {
      return true;
    }
    final first = parts[0];
    final second = parts[1];
    return first == 0 || // "this network"
        first == 10 || // private
        first == 127 || // loopback
        (first == 100 && second >= 64 && second <= 127) || // shared (CGNAT)
        (first == 169 && second == 254) || // link-local
        (first == 172 && second >= 16 && second <= 31) || // private
        (first == 192 && second == 168) || // private
        first >= 224; // multicast and reserved
  }
  return false;
}
/// Whether [statusCode] is an HTTP status that redirects a GET request.
///
/// Covers 301, 302, 303, 307 and 308. 303 (See Other) is included because
/// it also tells the client to fetch the `Location` target with GET.
bool _isRedirect(int statusCode) {
  const redirectStatuses = <int>{301, 302, 303, 307, 308};
  return redirectStatuses.contains(statusCode);
}
/// Whether a body of type [contentType] should be treated as binary.
///
/// Textual types (`text/*`, JSON, XML, JavaScript, XHTML) are never binary.
/// The generic `application/octet-stream` type, which is also what callers
/// pass when the server sent no content type, is decided by content: it is
/// binary only when a NUL byte appears in the first 256 [bytes]. Every other
/// type is binary.
bool _looksBinary(String contentType, List<int> bytes) {
  final isTextual = contentType.startsWith("text/") ||
      contentType.contains("json") ||
      contentType.contains("xml") ||
      contentType.contains("javascript") ||
      contentType.contains("xhtml");
  if (isTextual) {
    return false;
  }
  if (contentType == "application/octet-stream") {
    // The declared type says nothing here, so look at the data itself.
    // Previously the NUL scan had no effect: both outcomes returned true.
    return bytes.take(256).contains(0);
  }
  return true;
}
/// Limits [content] to [_maxPromptContentChars] UTF-16 code units.
///
/// Shorter content is returned unchanged. Longer content is cut and a
/// truncation notice is appended. The cut never splits a surrogate pair, so
/// the result does not end in half of a character.
String _truncateContent(String content) {
  if (content.length <= _maxPromptContentChars) {
    return content;
  }
  var cut = _maxPromptContentChars;
  if (cut > 0) {
    final lastKept = content.codeUnitAt(cut - 1);
    final isHighSurrogate = lastKept >= 0xD800 && lastKept <= 0xDBFF;
    if (isHighSurrogate) {
      // Its low surrogate falls beyond the cut; drop the orphaned half too.
      cut -= 1;
    }
  }
  return "${content.substring(0, cut)}\n\n[Content truncated due to length...]";
}
/// Replaces a small set of HTML character references in [text].
///
/// Handles `&nbsp;` (as a plain space), `&amp;`, `&lt;`, `&gt;`, `&quot;`,
/// `&#39;`, `&#x27;` and `&apos;`. Each reference is decoded exactly once, in
/// a single pass, so `&amp;lt;` becomes `&lt;` rather than `<`.
String _decodeHtmlEntities(String text) {
  const replacements = <String, String>{
    "&nbsp;": " ",
    "&amp;": "&",
    "&lt;": "<",
    "&gt;": ">",
    "&quot;": "\"",
    "&#39;": "'",
    "&#x27;": "'",
    "&apos;": "'",
  };
  return text.replaceAllMapped(
    RegExp(r"&(?:nbsp|amp|lt|gt|quot|apos|#39|#x27);"),
    (match) {
      final entity = match[0]!;
      return replacements[entity] ?? entity;
    },
  );
}
/// Picks a file extension (with leading dot) for [contentType].
///
/// The first marker found in [contentType] wins; `.bin` is the fallback.
String _extensionForMimeType(String contentType) {
  // Checked in declaration order.
  const extensionByMarker = <String, String>{
    "pdf": ".pdf",
    "zip": ".zip",
    "png": ".png",
    "jpeg": ".jpg",
    "gif": ".gif",
    "webp": ".webp",
    "json": ".json",
  };
  for (final candidate in extensionByMarker.entries) {
    if (contentType.contains(candidate.key)) {
      return candidate.value;
    }
  }
  return ".bin";
}
/// Extracts the non-blank strings from [value], trimmed.
///
/// Returns an empty list when [value] is not a list. Non-string elements
/// and strings that are empty after trimming are skipped. The returned list
/// has a fixed length.
List<String> _readStringList(Object? value) {
  if (value is! List) {
    return const <String>[];
  }
  final cleaned = <String>[];
  for (final item in value) {
    if (item is! String) {
      continue;
    }
    final trimmed = item.trim();
    if (trimmed.isNotEmpty) {
      cleaned.add(trimmed);
    }
  }
  return List<String>.of(cleaned, growable: false);
}
/// Returns six random characters from `0-9a-z` for use in file names.
///
/// The characters come from a cryptographically secure source, so names in
/// the shared temp directory cannot be guessed from the clock and two calls
/// in quick succession do not collide.
String _randomSuffix() {
  const alphabet = "0123456789abcdefghijklmnopqrstuvwxyz";
  final random = Random.secure();
  return String.fromCharCodes(
    Iterable<int>.generate(
      6,
      (_) => alphabet.codeUnitAt(random.nextInt(alphabet.length)),
    ),
  );
}
/// Returns [host] without a leading `www.` label, if it has one.
String _stripWww(String host) {
  const prefix = "www.";
  return host.startsWith(prefix) ? host.substring(prefix.length) : host;
}
/// Formats [durationMs] for display.
///
/// Values below one second are shown in milliseconds (`250ms`); longer ones
/// in seconds with one decimal (`1.5s`).
String _formatDuration(int durationMs) => durationMs < 1000
    ? "${durationMs}ms"
    : "${(durationMs / 1000).toStringAsFixed(1)}s";
}
/// Immutable result of one fetch: response metadata plus readable content.
class _FetchedContent {
  /// URL the content was finally read from.
  final String finalUrl;

  /// HTTP status code of the response.
  final int statusCode;

  /// HTTP reason phrase of the response.
  final String reasonPhrase;

  /// Number of body bytes read (0 for a redirect notice).
  final int bytes;

  /// MIME type of the response.
  final String contentType;

  /// Readable text extracted from the body, or the redirect notice text.
  final String content;

  /// Path of the saved copy of a binary body, if one was written.
  final String? persistedBinaryPath;

  /// Size in bytes of the saved copy of a binary body, if one was written.
  final int? persistedBinarySize;

  /// Whether [content] is a cross-host redirect notice rather than a page.
  final bool isRedirectNotice;

  const _FetchedContent({
    required this.finalUrl,
    required this.statusCode,
    required this.reasonPhrase,
    required this.bytes,
    required this.contentType,
    required this.content,
    this.persistedBinaryPath,
    this.persistedBinarySize,
    this.isRedirectNotice = false,
  });
}
/// Location and size of a binary body that was written to disk.
class _PersistedBinary {
  /// Path of the written file.
  final String path;

  /// Number of bytes written.
  final int size;

  const _PersistedBinary({required this.path, required this.size});
}
/// A cached fetch result together with the time it was stored.
class _CacheEntry {
  /// The cached fetch result.
  final _FetchedContent content;

  /// When the entry was stored; used to decide whether it has expired.
  final DateTime fetchedAt;

  const _CacheEntry({required this.content, required this.fetchedAt});
}
/// Cache key wrapping the requested URL; two keys are equal when their URLs
/// are equal.
class _CacheKey {
  /// The URL this key stands for.
  final String url;

  const _CacheKey(this.url);

  @override
  int get hashCode => url.hashCode;

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) {
      return true;
    }
    return other is _CacheKey && url == other.url;
  }
}