import "dart:convert";
import "dart:io";
import "dart:typed_data";

import "package:html/dom.dart" as dom;
import "package:html/parser.dart" as html_parser;
import "package:path/path.dart" as path;

import "../api/openrouter_client.dart";
import "base_tool.dart";

/// Longest URL (in characters) accepted by [WebFetchTool].
const int _maxUrlLength = 2000;

/// Upper bound on the number of response body bytes read per fetch.
const int _maxFetchBytes = 10 * 1024 * 1024;

/// Upper bound on the number of characters of page content kept for the model.
const int _maxPromptContentChars = 100000;

/// Maximum number of same-host redirects followed for one fetch.
const int _maxRedirects = 10;

/// How long a fetched page stays valid in the in-memory cache.
const Duration _cacheTtl = Duration(minutes: 15);

/// Maximum number of entries kept in the in-memory cache.
const int _maxCacheEntries = 64;

/// Cache keys ordered from least to most recently used.
final List<_CacheKey> _cacheOrder = <_CacheKey>[];

/// Fetched pages keyed by the normalized request URL.
final Map<_CacheKey, _CacheEntry> _cache = <_CacheKey, _CacheEntry>{};

/// Hosts whose pages are treated as pre-approved documentation sources.
const Set<String> _preapprovedHosts = {
  "platform.claude.com",
  "code.claude.com",
  "modelcontextprotocol.io",
  "docs.python.org",
  "en.cppreference.com",
  "docs.oracle.com",
  "learn.microsoft.com",
  "developer.mozilla.org",
  "go.dev",
  "pkg.go.dev",
  "www.php.net",
  "docs.swift.org",
  "kotlinlang.org",
  "ruby-doc.org",
  "doc.rust-lang.org",
  "www.typescriptlang.org",
  "react.dev",
  "angular.io",
  "vuejs.org",
  "nextjs.org",
  "expressjs.com",
  "nodejs.org",
  "bun.sh",
  "getbootstrap.com",
  "tailwindcss.com",
  "redux.js.org",
  "webpack.js.org",
  "jestjs.io",
  "reactrouter.com",
  "docs.djangoproject.com",
  "flask.palletsprojects.com",
  "fastapi.tiangolo.com",
  "pandas.pydata.org",
  "numpy.org",
  "www.tensorflow.org",
  "pytorch.org",
  "scikit-learn.org",
  "matplotlib.org",
  "requests.readthedocs.io",
  "jupyter.org",
  "laravel.com",
  "symfony.com",
  "wordpress.org",
  "docs.spring.io",
  "hibernate.org",
  "tomcat.apache.org",
  "gradle.org",
  "maven.apache.org",
  "asp.net",
  "dotnet.microsoft.com",
  "nuget.org",
  "reactnative.dev",
  "docs.flutter.dev",
  "developer.apple.com",
  "developer.android.com",
  "keras.io",
  "spark.apache.org",
  "huggingface.co",
  "www.kaggle.com",
  "www.mongodb.com",
  "redis.io",
  "www.postgresql.org",
  "dev.mysql.com",
  "www.sqlite.org",
  "graphql.org",
  "prisma.io",
  "docs.aws.amazon.com",
  "cloud.google.com",
  "kubernetes.io",
  "www.docker.com",
  "www.terraform.io",
  "www.ansible.com",
  "docs.netlify.com",
  "devcenter.heroku.com",
  "cypress.io",
  "selenium.dev",
  "docs.unity.com",
  "docs.unrealengine.com",
  "git-scm.com",
  "nginx.org",
  "httpd.apache.org",
};

/// Hosts that are pre-approved only beneath the listed path prefixes.
const Map<String, List<String>> _preapprovedPathPrefixes = {
  "github.com": ["/anthropics"],
  "vercel.com": ["/docs"],
};

/// Tool that downloads a web page, reduces it to markdown-like text and asks
/// a model to answer a prompt about it.
class WebFetchTool extends BaseTool {
  /// Elements that get a line break written before and after their children.
  static const Set<String> _blockTags = {
    "article",
    "section",
    "main",
    "div",
    "header",
    "footer",
    "nav",
    "aside",
  };

  /// Matches one or more consecutive whitespace characters.
  static final RegExp _whitespaceRun = RegExp(r"\s+");

  @override
  final String name = "WebFetch";

  @override
  final String description =
      "Fetch content from a URL, convert it to readable markdown-like text, and answer a prompt about that page.";

  /// Fetches `input["url"]` and answers `input["prompt"]` about its content.
  ///
  /// Also reads the optional `_api_key`, `_model`, `_permission_mode`,
  /// `_allow_rules`, `_ask_rules` and `_deny_rules` entries of [input].
  ///
  /// Throws [ArgumentError] for an empty prompt or an unacceptable URL and
  /// [StateError] when the API key is missing, a permission rule blocks the
  /// host, or the fetch itself fails.
  @override
  Future<String> execute(Map<String, dynamic> input) async {
    final rawUrl = requireString(input, "url").trim();
    final prompt = requireString(input, "prompt").trim();
    final apiKey = optionalString(input, "_api_key") ?? "";
    final model = optionalString(input, "_model") ?? "openrouter/auto";
    final permissionMode =
        optionalString(input, "_permission_mode") ?? "default";
    final allowRules = _readStringList(input["_allow_rules"]);
    final askRules = _readStringList(input["_ask_rules"]);
    final denyRules = _readStringList(input["_deny_rules"]);

    if (prompt.isEmpty) {
      throw ArgumentError("prompt must not be empty");
    }
    if (apiKey.isEmpty) {
      throw StateError("WebFetch requires an OpenRouter API key");
    }

    final url = _normalizeUrl(rawUrl);
    final uri = Uri.parse(url);
    _enforcePermissions(
      uri: uri,
      permissionMode: permissionMode,
      allowRules: allowRules,
      askRules: askRules,
      denyRules: denyRules,
    );
    final isPreapproved = _isPreapprovedUrl(url);

    // A Stopwatch is monotonic, unlike the difference of two wall-clock reads.
    final stopwatch = Stopwatch()..start();
    final fetched = await _fetchUrl(url);
    final durationMs = stopwatch.elapsedMilliseconds;

    if (fetched.isRedirectNotice) {
      // Cross-host redirects are reported verbatim; the model is not called.
      return _formatOutput(
        fetched: fetched,
        durationMs: durationMs,
        result: fetched.content,
      );
    }

    final result = await _summarizeFetchedContent(
      apiKey: apiKey,
      model: model,
      fetched: fetched,
      prompt: prompt,
      isPreapproved: isPreapproved,
    );
    return _formatOutput(
      fetched: fetched,
      durationMs: durationMs,
      result: result,
    );
  }

  /// Validates [rawUrl] and returns it as an `https` URL string.
  ///
  /// `http` URLs are upgraded to `https`. Throws [ArgumentError] when the URL
  /// is empty, too long, unparsable, lacks a scheme or host, carries user
  /// info, uses another scheme, or names a local/private host.
  String _normalizeUrl(String rawUrl) {
    if (rawUrl.isEmpty || rawUrl.length > _maxUrlLength) {
      throw ArgumentError("Invalid URL");
    }
    Uri uri;
    try {
      uri = Uri.parse(rawUrl);
    } on FormatException {
      throw ArgumentError("Invalid URL: $rawUrl");
    }
    if (!uri.hasScheme) {
      throw ArgumentError("URL must include a scheme");
    }
    if (uri.userInfo.isNotEmpty || uri.host.isEmpty) {
      throw ArgumentError("Invalid URL");
    }
    if (uri.scheme == "http") {
      uri = uri.replace(scheme: "https");
    }
    if (uri.scheme != "https") {
      throw ArgumentError("Only https URLs are supported");
    }
    final host = uri.host.toLowerCase();
    if (_isLocalOrPrivateHost(host)) {
      throw ArgumentError(
        "Fetching local or private network URLs is not allowed",
      );
    }
    return uri.toString();
  }

  /// Applies the deny, allow and ask rules to [uri], in that order.
  ///
  /// Returns without checking anything when [permissionMode] is
  /// `bypassPermissions` or `dontAsk`. Throws [StateError] when a deny rule
  /// matches, or when an ask rule matches and no allow rule does. A URL that
  /// matches no rule at all is let through.
  void _enforcePermissions({
    required Uri uri,
    required String permissionMode,
    required List<String> allowRules,
    required List<String> askRules,
    required List<String> denyRules,
  }) {
    const bypassModes = {"bypassPermissions", "dontAsk"};
    if (bypassModes.contains(permissionMode)) {
      return;
    }
    final domainRule = "domain:${uri.host.toLowerCase()}";
    if (denyRules.any((rule) => _matchesRule(rule, uri))) {
      throw StateError("WebFetch denied access to $domainRule.");
    }
    if (allowRules.any((rule) => _matchesRule(rule, uri))) {
      return;
    }
    if (askRules.any((rule) => _matchesRule(rule, uri))) {
      throw StateError(
        "WebFetch requires permission for $domainRule. "
        "Add an allow rule to proceed.",
      );
    }
  }

  /// Whether [rawRule] applies to the host of [uri].
  ///
  /// A rule is `domain:<pattern>`, optionally wrapped as `WebFetch(...)`.
  /// The pattern matches an identical host, `*.suffix` matches hosts ending
  /// in `.suffix`, and `prefix.*` matches hosts starting with `prefix.`.
  bool _matchesRule(String rawRule, Uri uri) {
    var rule = rawRule.trim();
    if (rule.startsWith("WebFetch(") && rule.endsWith(")")) {
      rule = rule.substring("WebFetch(".length, rule.length - 1);
    }
    if (!rule.startsWith("domain:")) {
      return false;
    }
    final pattern = rule.substring("domain:".length).toLowerCase();
    final host = uri.host.toLowerCase();
    if (pattern.isEmpty) {
      return false;
    }
    if (pattern == host) {
      return true;
    }
    if (pattern.startsWith("*.")) {
      // Keep the leading dot so "*.a.com" cannot match "nota.com".
      final suffix = pattern.substring(1);
      return host.endsWith(suffix);
    }
    if (pattern.endsWith(".*")) {
      // Keep the trailing dot so "a.*" cannot match "ab.com".
      final prefix = pattern.substring(0, pattern.length - 1);
      return host.startsWith(prefix);
    }
    return false;
  }

  /// Downloads [originalUrl], using and updating the in-memory cache.
  ///
  /// Redirects that stay on the same host (ignoring a leading `www.`) are
  /// followed, up to [_maxRedirects]; a redirect to another host is returned
  /// as a notice instead of being followed. Throws [StateError] when the
  /// redirect limit is exceeded or a redirect uses an unsupported scheme.
  Future<_FetchedContent> _fetchUrl(String originalUrl) async {
    _pruneCache();
    final cacheKey = _CacheKey(originalUrl);
    final cached = _cache[cacheKey];
    if (cached != null &&
        DateTime.now().difference(cached.fetchedAt) < _cacheTtl) {
      _touchCacheEntry(cacheKey);
      return cached.content;
    }

    final httpClient = HttpClient()
      ..connectionTimeout = const Duration(seconds: 60);
    try {
      var currentUrl = Uri.parse(originalUrl);
      final originalComparableHost = _stripWww(currentUrl.host);
      for (var redirectCount = 0;
          redirectCount <= _maxRedirects;
          redirectCount++) {
        final request = await httpClient.getUrl(currentUrl);
        // dart:io follows redirects itself by default, which would skip the
        // cross-host check below; every hop must be handled manually.
        request.followRedirects = false;
        request.headers
            .set("Accept", "text/markdown, text/html, text/plain, */*");
        request.headers.set("User-Agent", "clawd_code/0.1.0 (WebFetch)");
        final response =
            await request.close().timeout(const Duration(seconds: 60));
        final statusCode = response.statusCode;
        final statusText = response.reasonPhrase;
        final location = response.headers.value(HttpHeaders.locationHeader);

        if (_isRedirect(statusCode) && location != null) {
          final redirectUrl = currentUrl.resolve(location);
          if (_stripWww(redirectUrl.host) != originalComparableHost) {
            final redirectNotice = _FetchedContent(
              finalUrl: currentUrl.toString(),
              statusCode: statusCode,
              reasonPhrase: statusText,
              bytes: 0,
              contentType: "text/plain",
              content:
                  "REDIRECT DETECTED: The URL redirects to a different host.\n\n"
                  "Original URL: $currentUrl\n"
                  "Redirect URL: $redirectUrl\n"
                  "Status: $statusCode $statusText\n\n"
                  "To complete your request, use WebFetch again with these parameters:\n"
                  "- url: \"$redirectUrl\"",
              isRedirectNotice: true,
            );
            _storeCacheEntry(cacheKey, redirectNotice);
            return redirectNotice;
          }
          currentUrl = _secureRedirectTarget(redirectUrl);
          continue;
        }

        final bytes = await _readResponseBytes(response);
        final contentType = response.headers.contentType?.mimeType ??
            "application/octet-stream";
        final isBinary = _looksBinary(contentType, bytes);
        final persistedBinary =
            isBinary ? await _persistBinaryContent(bytes, contentType) : null;
        final decodedText = _decodeBody(bytes, isBinary: isBinary);
        final readableContent = _extractReadableContent(
          decodedText,
          contentType: contentType,
          url: currentUrl.toString(),
        );
        final fetched = _FetchedContent(
          finalUrl: currentUrl.toString(),
          statusCode: statusCode,
          reasonPhrase: statusText,
          bytes: bytes.length,
          contentType: contentType,
          content: readableContent,
          persistedBinaryPath: persistedBinary?.path,
          persistedBinarySize: persistedBinary?.size,
        );
        _storeCacheEntry(cacheKey, fetched);
        return fetched;
      }
      throw StateError("Too many redirects");
    } finally {
      // Redirect responses are never read, so their connections are still
      // active; force the close so those sockets are released immediately.
      httpClient.close(force: true);
    }
  }

  /// Returns [target] as an `https` URL so a redirect cannot leave TLS.
  ///
  /// `http` targets are upgraded, matching [_normalizeUrl]. Throws
  /// [StateError] for any other scheme.
  Uri _secureRedirectTarget(Uri target) {
    if (target.scheme == "https") {
      return target;
    }
    if (target.scheme == "http") {
      return target.replace(scheme: "https");
    }
    throw StateError("Redirect to unsupported URL scheme: ${target.scheme}");
  }

  /// Reads the whole body of [response].
  ///
  /// Throws [StateError] as soon as more than [_maxFetchBytes] were received.
  Future<List<int>> _readResponseBytes(HttpClientResponse response) async {
    final builder = BytesBuilder(copy: false);
    await for (final chunk in response) {
      builder.add(chunk);
      if (builder.length > _maxFetchBytes) {
        throw StateError("Response exceeded $_maxFetchBytes bytes");
      }
    }
    return builder.takeBytes();
  }

  /// Decodes [bytes] to text.
  ///
  /// Binary bodies are decoded as Latin-1. Everything else is decoded as
  /// UTF-8, falling back to Latin-1 when the bytes are not valid UTF-8.
  String _decodeBody(List<int> bytes, {required bool isBinary}) {
    if (isBinary) {
      return latin1.decode(bytes, allowInvalid: true);
    }
    try {
      return utf8.decode(bytes);
    } on FormatException {
      return latin1.decode(bytes, allowInvalid: true);
    }
  }

  /// Writes [bytes] to a new file in the system temp directory.
  ///
  /// Returns the file's path and size, or `null` when writing failed.
  Future<_PersistedBinary?> _persistBinaryContent(
    List<int> bytes,
    String contentType,
  ) async {
    try {
      final extension = _extensionForMimeType(contentType);
      final fileName =
          "webfetch-${DateTime.now().millisecondsSinceEpoch}-${_randomSuffix()}$extension";
      final file = File(path.join(Directory.systemTemp.path, fileName));
      await file.writeAsBytes(bytes, flush: true);
      return _PersistedBinary(path: file.path, size: bytes.length);
    } catch (_) {
      // Best effort: a failed save must not fail the fetch itself.
      return null;
    }
  }

  /// Converts [rawContent] into truncated, markdown-like text.
  ///
  /// Markdown and plain-text bodies are only trimmed and truncated. Anything
  /// else is parsed as HTML: the title and description come first, followed
  /// by the text of the first `<article>`, else `<main>`, else the body.
  /// Throws [StateError] when no text could be extracted.
  String _extractReadableContent(
    String rawContent, {
    required String contentType,
    required String url,
  }) {
    if (contentType.contains("markdown") || contentType.contains("plain")) {
      return _truncateContent(rawContent.trim());
    }

    final document = html_parser.parse(rawContent);
    for (final node
        in document.querySelectorAll("script,style,noscript,svg,iframe")) {
      node.remove();
    }

    final title = document.querySelector("title")?.text.trim();
    final description = document
        .querySelector(
          'meta[name="description"], meta[property="og:description"]',
        )
        ?.attributes["content"]
        ?.trim();
    final root = document.querySelector("article") ??
        document.querySelector("main") ??
        document.body ??
        document.documentElement;
    if (root == null) {
      throw StateError("No readable content found at $url");
    }

    final buffer = StringBuffer();
    if (title != null && title.isNotEmpty) {
      buffer.writeln("# $title");
      buffer.writeln();
    }
    if (description != null && description.isNotEmpty) {
      buffer.writeln(description);
      buffer.writeln();
    }
    for (final node in root.nodes) {
      _writeNode(node, buffer, listDepth: 0, inPre: false);
    }

    var result = buffer.toString();
    // Collapse runs of blank lines left behind by nested block elements.
    result = result.replaceAll(RegExp(r"\n{3,}"), "\n\n").trim();
    // NOTE(review): the HTML parser is expected to have resolved character
    // references already, so this pass only affects text that was escaped
    // twice in the page source - confirm it is still wanted.
    result = _decodeHtmlEntities(result);
    if (result.isEmpty) {
      throw StateError("No readable content found at $url");
    }
    return _truncateContent(result);
  }

  /// Appends a markdown-like rendering of [node] to [buffer].
  ///
  /// [listDepth] is the nesting depth of the enclosing list, used to indent
  /// list items. [inPre] keeps whitespace in text nodes instead of
  /// collapsing it.
  void _writeNode(
    dom.Node node,
    StringBuffer buffer, {
    required int listDepth,
    required bool inPre,
  }) {
    if (node is dom.Text) {
      final text = inPre ? node.text : node.text.replaceAll(_whitespaceRun, " ");
      if (text.trim().isNotEmpty) {
        buffer.write(text);
      }
      return;
    }
    if (node is! dom.Element) {
      return;
    }

    final tag = node.localName?.toLowerCase() ?? "";
    switch (tag) {
      case "h1":
      case "h2":
      case "h3":
      case "h4":
      case "h5":
      case "h6":
        // The digit in the tag name is the heading level.
        final level = int.tryParse(tag.substring(1)) ?? 1;
        buffer
          ..writeln()
          ..write("${"#" * level} ${node.text.trim()}")
          ..writeln()
          ..writeln();
        return;
      case "p":
        _writeChildren(node, buffer, listDepth: listDepth, inPre: false);
        buffer.writeln();
        buffer.writeln();
        return;
      case "br":
        buffer.writeln();
        return;
      case "pre":
        final code = node.text.trimRight();
        if (code.isNotEmpty) {
          buffer
            ..writeln()
            ..writeln("```")
            ..writeln(code)
            ..writeln("```")
            ..writeln();
        }
        return;
      case "code":
        final code = node.text.replaceAll(_whitespaceRun, " ").trim();
        if (code.isNotEmpty) {
          buffer.write("`$code`");
        }
        return;
      case "ul":
      case "ol":
        buffer.writeln();
        var index = 1;
        for (final child
            in node.children.where((child) => child.localName == "li")) {
          final prefix = tag == "ol" ? "${index++}." : "-";
          buffer.write("${" " * listDepth}$prefix ");
          _writeChildren(child, buffer, listDepth: listDepth + 1, inPre: false);
          buffer.writeln();
        }
        buffer.writeln();
        return;
      case "li":
        _writeChildren(node, buffer, listDepth: listDepth, inPre: false);
        return;
      case "a":
        final label = node.text.replaceAll(_whitespaceRun, " ").trim();
        final href = node.attributes["href"]?.trim();
        if (label.isNotEmpty && href != null && href.isNotEmpty) {
          buffer.write("[$label]($href)");
        } else {
          _writeChildren(node, buffer, listDepth: listDepth, inPre: false);
        }
        return;
      case "blockquote":
        final quote = node.text.trim();
        if (quote.isNotEmpty) {
          buffer
            ..writeln()
            ..writeln("> ${quote.replaceAll("\n", "\n> ")}")
            ..writeln();
        }
        return;
      case "table":
        // Tables are flattened to a single line of text.
        final tableText = node.text.replaceAll(_whitespaceRun, " ").trim();
        if (tableText.isNotEmpty) {
          buffer
            ..writeln()
            ..writeln(tableText)
            ..writeln();
        }
        return;
      case "hr":
        buffer
          ..writeln()
          ..writeln("---")
          ..writeln();
        return;
      default:
        final wasBlock = _blockTags.contains(tag);
        if (wasBlock) {
          buffer.writeln();
        }
        _writeChildren(node, buffer, listDepth: listDepth, inPre: inPre);
        if (wasBlock) {
          buffer.writeln();
        }
    }
  }

  /// Renders every child node of [element] with [_writeNode].
  void _writeChildren(
    dom.Element element,
    StringBuffer buffer, {
    required int listDepth,
    required bool inPre,
  }) {
    for (final child in element.nodes) {
      _writeNode(child, buffer, listDepth: listDepth, inPre: inPre);
    }
  }

  /// Asks [model] to answer [prompt] about [fetched] and returns its text.
  ///
  /// Pre-approved markdown content that fits within
  /// [_maxPromptContentChars] is returned as is, without calling the model.
  /// Returns `"No response from model."` when the reply has no text blocks.
  Future<String> _summarizeFetchedContent({
    required String apiKey,
    required String model,
    required _FetchedContent fetched,
    required String prompt,
    required bool isPreapproved,
  }) async {
    if (isPreapproved &&
        fetched.contentType.contains("markdown") &&
        fetched.content.length <= _maxPromptContentChars) {
      return fetched.content;
    }

    final client = await OpenRouterClientFactory.create(apiKey: apiKey);
    try {
      final response = await client.createMessage(
        model: model,
        maxTokens: 2048,
        // The element type is inferred from createMessage's parameter type.
        messages: [
          {
            "role": "system",
            "content": isPreapproved
                ? "Provide a concise response based on the fetched content. Include relevant details and code examples when present."
                : "Provide a concise response based only on the fetched content. Use short quotes only when necessary.",
          },
          {
            "role": "user",
            "content": "URL: ${fetched.finalUrl}\n"
                "Content-Type: ${fetched.contentType}\n\n"
                "Web page content:\n---\n${fetched.content}\n---\n\n"
                "$prompt",
          },
        ],
      );

      final parts = <String>[];
      for (final block in response.content) {
        if (block is Map && block["type"] == "text") {
          final text = block["text"];
          if (text is String && text.isNotEmpty) {
            parts.add(text);
          }
        }
      }
      final result = parts.join("\n").trim();
      return result.isEmpty ? "No response from model." : result;
    } finally {
      client.close();
    }
  }

  /// Builds the tool output: a metadata header, a blank line, then [result].
  String _formatOutput({
    required _FetchedContent fetched,
    required int durationMs,
    required String result,
  }) {
    final binaryPath = fetched.persistedBinaryPath;
    final binarySize = fetched.persistedBinarySize;
    final lines = <String>[
      "URL: ${fetched.finalUrl}",
      "Status: ${fetched.statusCode} ${fetched.reasonPhrase}",
      "Bytes: ${fetched.bytes}",
      "Duration: ${_formatDuration(durationMs)}",
      if (binaryPath != null && binarySize != null)
        "Binary content saved: $binaryPath ($binarySize bytes)",
      "",
      result.trim(),
    ];
    return lines.join("\n");
  }

  /// Removes cache entries that are missing or older than [_cacheTtl].
  void _pruneCache() {
    final now = DateTime.now();
    _cacheOrder.removeWhere((key) {
      final entry = _cache[key];
      final expired =
          entry == null || now.difference(entry.fetchedAt) >= _cacheTtl;
      if (expired) {
        _cache.remove(key);
      }
      return expired;
    });
  }

  /// Stores [content] under [key] and evicts the least recently used entries
  /// once more than [_maxCacheEntries] are held.
  void _storeCacheEntry(_CacheKey key, _FetchedContent content) {
    _cache[key] = _CacheEntry(content: content, fetchedAt: DateTime.now());
    _touchCacheEntry(key);
    while (_cacheOrder.length > _maxCacheEntries) {
      final removed = _cacheOrder.removeAt(0);
      _cache.remove(removed);
    }
  }

  /// Marks [key] as the most recently used cache entry.
  void _touchCacheEntry(_CacheKey key) {
    _cacheOrder.remove(key);
    _cacheOrder.add(key);
  }

  /// Whether [url] is on a pre-approved host, or beneath one of the
  /// pre-approved path prefixes of its host.
  bool _isPreapprovedUrl(String url) {
    final uri = Uri.parse(url);
    final host = uri.host.toLowerCase();
    if (_preapprovedHosts.contains(host)) {
      return true;
    }
    final prefixes = _preapprovedPathPrefixes[host];
    if (prefixes == null) {
      return false;
    }
    final pathName = uri.path;
    // Require a path separator after the prefix so "/docs" does not also
    // approve "/docs-evil".
    return prefixes.any(
      (prefix) => pathName == prefix || pathName.startsWith("$prefix/"),
    );
  }

  /// Whether [host] names the local machine or a non-public network.
  ///
  /// Rejects `localhost`, dotless names, `.local` and `.localhost` names,
  /// every IPv6 literal, and IPv4 addresses in the unspecified, private,
  /// carrier-grade NAT, loopback, link-local, multicast and reserved ranges.
  /// Numeric hosts that are not a canonical dotted quad are rejected too,
  /// because resolvers may read them as shorthand, hexadecimal or octal IPv4.
  bool _isLocalOrPrivateHost(String host) {
    var lower = host.toLowerCase();
    // "localhost." and "localhost" are the same name.
    if (lower.endsWith(".")) {
      lower = lower.substring(0, lower.length - 1);
    }
    if (lower == "localhost" || !lower.contains(".")) {
      return true;
    }
    // A colon only occurs in IPv6 literals. Dotless ones were rejected above;
    // this also covers embedded IPv4 forms such as "::ffff:127.0.0.1".
    if (lower.contains(":")) {
      return true;
    }
    if (lower.endsWith(".local") || lower.endsWith(".localhost")) {
      return true;
    }

    final labels = lower.split(".");
    // No public top-level domain is numeric, so a numeric last label means
    // the host is meant as an IPv4 address.
    final numericLabel = RegExp(r"^(\d+|0x[0-9a-f]*)$");
    if (!numericLabel.hasMatch(labels.last)) {
      return false;
    }

    // Leading zeros are refused because they may be read as octal.
    final decimalOctet = RegExp(r"^(0|[1-9]\d{0,2})$");
    if (labels.length != 4 || !labels.every(decimalOctet.hasMatch)) {
      return true;
    }
    final parts = labels.map(int.parse).toList();
    if (parts.any((part) => part > 255)) {
      return true;
    }
    final first = parts[0];
    final second = parts[1];
    return first == 0 ||
        first == 10 ||
        first == 127 ||
        first >= 224 ||
        (first == 100 && second >= 64 && second <= 127) ||
        (first == 169 && second == 254) ||
        (first == 172 && second >= 16 && second <= 31) ||
        (first == 192 && second == 168);
  }

  /// Whether [statusCode] is a redirect status that is followed.
  bool _isRedirect(int statusCode) {
    return statusCode == 301 ||
        statusCode == 302 ||
        statusCode == 303 ||
        statusCode == 307 ||
        statusCode == 308;
  }

  /// Whether a body of type [contentType] should be handled as binary.
  ///
  /// Textual types are never binary. The generic `application/octet-stream`
  /// type is binary only when a NUL byte occurs in the first 256 bytes of
  /// [bytes]. Every other type is binary.
  bool _looksBinary(String contentType, List<int> bytes) {
    if (contentType.startsWith("text/") ||
        contentType.contains("json") ||
        contentType.contains("xml") ||
        contentType.contains("javascript") ||
        contentType.contains("xhtml")) {
      return false;
    }
    if (contentType != "application/octet-stream") {
      return true;
    }
    // This type is also the fallback for a missing Content-Type header, so
    // look at the payload instead of trusting the label.
    return bytes.take(256).contains(0);
  }

  /// Cuts [content] to [_maxPromptContentChars] characters, appending a
  /// notice when something was removed.
  String _truncateContent(String content) {
    if (content.length <= _maxPromptContentChars) {
      return content;
    }
    return "${content.substring(0, _maxPromptContentChars)}\n\n[Content truncated due to length...]";
  }

  /// Replaces a small set of HTML character references in [text].
  String _decodeHtmlEntities(String text) {
    return text
        .replaceAll("&nbsp;", " ")
        .replaceAll("&lt;", "<")
        .replaceAll("&gt;", ">")
        .replaceAll("&quot;", "\"")
        .replaceAll("&#39;", "'")
        .replaceAll("&#x27;", "'")
        .replaceAll("&apos;", "'")
        // Decoded last so an escaped reference such as "&amp;lt;" becomes
        // "&lt;" rather than being decoded a second time into "<".
        .replaceAll("&amp;", "&");
  }

  /// Returns the file extension used when saving a body of [contentType].
  String _extensionForMimeType(String contentType) {
    if (contentType.contains("pdf")) return ".pdf";
    if (contentType.contains("zip")) return ".zip";
    if (contentType.contains("png")) return ".png";
    if (contentType.contains("jpeg")) return ".jpg";
    if (contentType.contains("gif")) return ".gif";
    if (contentType.contains("webp")) return ".webp";
    if (contentType.contains("json")) return ".json";
    return ".bin";
  }

  /// Returns the non-empty, trimmed strings contained in [value].
  ///
  /// Returns an empty list when [value] is not a list.
  List<String> _readStringList(Object? value) {
    if (value is! List) {
      return const <String>[];
    }
    return value
        .whereType<String>()
        .map((item) => item.trim())
        .where((item) => item.isNotEmpty)
        .toList(growable: false);
  }

  /// Returns the last six base-36 digits of the current time in microseconds.
  String _randomSuffix() {
    final radix = DateTime.now().microsecondsSinceEpoch.toRadixString(36);
    return radix.substring(radix.length - 6);
  }

  /// Returns [host] without a leading `www.` label.
  String _stripWww(String host) => host.replaceFirst(RegExp(r"^www\."), "");

  /// Formats [durationMs] as milliseconds below one second, else as seconds
  /// with one decimal place.
  String _formatDuration(int durationMs) {
    if (durationMs < 1000) {
      return "${durationMs}ms";
    }
    return "${(durationMs / 1000).toStringAsFixed(1)}s";
  }
}

/// Result of one fetch: response metadata plus the extracted text.
class _FetchedContent {
  const _FetchedContent({
    required this.finalUrl,
    required this.statusCode,
    required this.reasonPhrase,
    required this.bytes,
    required this.contentType,
    required this.content,
    this.persistedBinaryPath,
    this.persistedBinarySize,
    this.isRedirectNotice = false,
  });

  /// URL of the last request that was made.
  final String finalUrl;

  final int statusCode;

  final String reasonPhrase;

  /// Number of body bytes that were read.
  final int bytes;

  final String contentType;

  /// Extracted text, or the notice text for a cross-host redirect.
  final String content;

  /// Path of the saved body; set only when a binary body was saved.
  final String? persistedBinaryPath;

  /// Size in bytes of the saved body; set only when a binary body was saved.
  final int? persistedBinarySize;

  /// Whether [content] is a cross-host redirect notice rather than a page.
  final bool isRedirectNotice;
}

/// Location and size of a binary body saved to disk.
class _PersistedBinary {
  const _PersistedBinary({required this.path, required this.size});

  final String path;

  final int size;
}

/// A cached fetch result together with the time it was stored.
class _CacheEntry {
  const _CacheEntry({required this.content, required this.fetchedAt});

  final _FetchedContent content;

  final DateTime fetchedAt;
}

/// Cache key wrapping the normalized request URL.
class _CacheKey {
  const _CacheKey(this.url);

  final String url;

  @override
  bool operator ==(Object other) =>
      identical(this, other) || other is _CacheKey && other.url == url;

  @override
  int get hashCode => url.hashCode;
}