diff --git a/apps/workers/network.ts b/apps/workers/network.ts new file mode 100644 index 00000000..acfd2439 --- /dev/null +++ b/apps/workers/network.ts @@ -0,0 +1,419 @@ +import dns from "node:dns/promises"; +import type { HeadersInit, RequestInit, Response } from "node-fetch"; +import { HttpProxyAgent } from "http-proxy-agent"; +import { HttpsProxyAgent } from "https-proxy-agent"; +import ipaddr from "ipaddr.js"; +import { LRUCache } from "lru-cache"; +import fetch, { Headers } from "node-fetch"; + +import serverConfig from "@karakeep/shared/config"; + +const DISALLOWED_IP_RANGES = new Set([ + // IPv4 ranges + "unspecified", + "broadcast", + "multicast", + "linkLocal", + "loopback", + "private", + "reserved", + "carrierGradeNat", + // IPv6 ranges + "uniqueLocal", + "6to4", // RFC 3056 - IPv6 transition mechanism + "teredo", // RFC 4380 - IPv6 tunneling + "benchmarking", // RFC 5180 - benchmarking addresses + "deprecated", // RFC 3879 - deprecated IPv6 addresses + "discard", // RFC 6666 - discard-only prefix +]); + +// DNS cache with 5 minute TTL and max 1000 entries +const dnsCache = new LRUCache<string, string[]>({ + max: 1000, + ttl: 5 * 60 * 1000, // 5 minutes in milliseconds +}); + +async function resolveHostAddresses(hostname: string): Promise<string[]> { + const resolver = new dns.Resolver({ + timeout: serverConfig.crawler.ipValidation.dnsResolverTimeoutSec * 1000, + }); + + const results = await Promise.allSettled([ + resolver.resolve4(hostname), + resolver.resolve6(hostname), + ]); + + const addresses: string[] = []; + const errors: string[] = []; + + for (const result of results) { + if (result.status === "fulfilled") { + addresses.push(...result.value); + } else { + const reason = result.reason; + if (reason instanceof Error) { + errors.push(reason.message); + } else { + errors.push(String(reason)); + } + } + } + + if (addresses.length > 0) { + return addresses; + } + + const errorMessage = + errors.length > 0 + ? errors.join("; ") + : "DNS lookup did not return any A or AAAA records"; + throw new Error(errorMessage); +} + +function isAddressForbidden(address: string): boolean { + if (!ipaddr.isValid(address)) { + return true; + } + const parsed = ipaddr.parse(address); + if ( + parsed.kind() === "ipv6" && + (parsed as ipaddr.IPv6).isIPv4MappedAddress() + ) { + const mapped = (parsed as ipaddr.IPv6).toIPv4Address(); + return DISALLOWED_IP_RANGES.has(mapped.range()); + } + return DISALLOWED_IP_RANGES.has(parsed.range()); +} + +export type UrlValidationResult = + | { ok: true; url: URL } + | { ok: false; reason: string }; + +function hostnameMatchesAnyPattern( + hostname: string, + patterns: string[], +): boolean { + function hostnameMatchesPattern(hostname: string, pattern: string): boolean { + return ( + pattern === hostname || + (pattern.startsWith(".") && hostname.endsWith(pattern)) || + hostname.endsWith("." + pattern) + ); + } + + for (const pattern of patterns) { + if (hostnameMatchesPattern(hostname, pattern)) { + return true; + } + } + return false; +} + +function isHostnameAllowedForInternalAccess(hostname: string): boolean { + if (!serverConfig.allowedInternalHostnames) { + return false; + } + return hostnameMatchesAnyPattern( + hostname, + serverConfig.allowedInternalHostnames, + ); +} + +export async function validateUrl( + urlCandidate: string, + runningInProxyContext: boolean, +): Promise<UrlValidationResult> { + let parsedUrl: URL; + try { + parsedUrl = new URL(urlCandidate); + } catch (error) { + return { + ok: false, + reason: `Invalid URL "${urlCandidate}": ${ + error instanceof Error ? 
error.message : String(error) + }`, + } as const; + } + + if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") { + return { + ok: false, + reason: `Unsupported protocol for URL: ${parsedUrl.toString()}`, + } as const; + } + + const hostname = parsedUrl.hostname; + if (!hostname) { + return { + ok: false, + reason: `URL ${parsedUrl.toString()} must include a hostname`, + } as const; + } + + if (isHostnameAllowedForInternalAccess(hostname)) { + return { ok: true, url: parsedUrl } as const; + } + + if (ipaddr.isValid(hostname)) { + if (isAddressForbidden(hostname)) { + return { + ok: false, + reason: `Refusing to access disallowed IP address ${hostname} (requested via ${parsedUrl.toString()})`, + } as const; + } + return { ok: true, url: parsedUrl } as const; + } + + if (runningInProxyContext) { + // If we're running in a proxy context, we must skip DNS resolution + // as the DNS resolution will be handled by the proxy + return { ok: true, url: parsedUrl } as const; + } + + // Check cache first + let records = dnsCache.get(hostname); + + if (!records) { + // Cache miss or expired - perform DNS resolution + try { + records = await resolveHostAddresses(hostname); + dnsCache.set(hostname, records); + } catch (error) { + return { + ok: false, + reason: `Failed to resolve hostname ${hostname}: ${ + error instanceof Error ? error.message : String(error) + }`, + } as const; + } + } + + if (!records || records.length === 0) { + return { + ok: false, + reason: `DNS lookup for ${hostname} did not return any addresses (requested via ${parsedUrl.toString()})`, + } as const; + } + + for (const record of records) { + if (isAddressForbidden(record)) { + return { + ok: false, + reason: `Refusing to access disallowed resolved address ${record} for host ${hostname}`, + } as const; + } + } + + return { ok: true, url: parsedUrl } as const; +} + +export function getRandomProxy(proxyList: string[]): string { + return proxyList[Math.floor(Math.random() * proxyList.length)].trim(); +} + +export function matchesNoProxy(url: string, noProxy: string[]) { + const urlObj = new URL(url); + const hostname = urlObj.hostname; + return hostnameMatchesAnyPattern(hostname, noProxy); +} + +export function getProxyAgent(url: string) { + const { proxy } = serverConfig; + + if (!proxy.httpProxy && !proxy.httpsProxy) { + return undefined; + } + + const urlObj = new URL(url); + const protocol = urlObj.protocol; + + // Check if URL should bypass proxy + if (proxy.noProxy && matchesNoProxy(url, proxy.noProxy)) { + return undefined; + } + + if (protocol === "https:" && proxy.httpsProxy) { + const selectedProxy = getRandomProxy(proxy.httpsProxy); + return new HttpsProxyAgent(selectedProxy); + } else if (protocol === "http:" && proxy.httpProxy) { + const selectedProxy = getRandomProxy(proxy.httpProxy); + return new HttpProxyAgent(selectedProxy); + } else if (proxy.httpProxy) { + const selectedProxy = getRandomProxy(proxy.httpProxy); + return new HttpProxyAgent(selectedProxy); + } + + return undefined; +} + +function cloneHeaders(init?: HeadersInit): Headers { + const headers = new Headers(); + if (!init) { + return headers; + } + if (init instanceof Headers) { + init.forEach((value, key) => { + headers.set(key, value); + }); + return headers; + } + + if (Array.isArray(init)) { + for (const [key, value] of init) { + headers.append(key, value); + } + return headers; + } + + for (const [key, value] of Object.entries(init)) { + if (Array.isArray(value)) { + headers.set(key, value.join(", ")); + } else if (value !== 
undefined) { + headers.set(key, value); + } + } + + return headers; +} + +function isRedirectResponse(response: Response): boolean { + return ( + response.status === 301 || + response.status === 302 || + response.status === 303 || + response.status === 307 || + response.status === 308 + ); +} + +export type FetchWithProxyOptions = Omit< + RequestInit & { + maxRedirects?: number; + }, + "agent" +>; + +interface PreparedFetchOptions { + maxRedirects: number; + baseHeaders: Headers; + method: string; + body?: RequestInit["body"]; + baseOptions: RequestInit; +} + +export function prepareFetchOptions( + options: FetchWithProxyOptions = {}, +): PreparedFetchOptions { + const { + maxRedirects = 5, + headers: initHeaders, + method: initMethod, + body: initBody, + redirect: _ignoredRedirect, + ...restOptions + } = options; + + const baseOptions = restOptions as RequestInit; + + return { + maxRedirects, + baseHeaders: cloneHeaders(initHeaders), + method: initMethod?.toUpperCase?.() ?? "GET", + body: initBody, + baseOptions, + }; +} + +interface BuildFetchOptionsInput { + method: string; + body?: RequestInit["body"]; + headers: Headers; + agent?: RequestInit["agent"]; + baseOptions: RequestInit; +} + +export function buildFetchOptions({ + method, + body, + headers, + agent, + baseOptions, +}: BuildFetchOptionsInput): RequestInit { + return { + ...baseOptions, + method, + body, + headers, + agent, + redirect: "manual", + }; +} + +export const fetchWithProxy = async ( + url: string, + options: FetchWithProxyOptions = {}, +) => { + const { + maxRedirects, + baseHeaders, + method: preparedMethod, + body: preparedBody, + baseOptions, + } = prepareFetchOptions(options); + + let redirectsRemaining = maxRedirects; + let currentUrl = url; + let currentMethod = preparedMethod; + let currentBody = preparedBody; + + while (true) { + const agent = getProxyAgent(currentUrl); + + const validation = await validateUrl(currentUrl, !!agent); + if (!validation.ok) { + throw new Error(validation.reason); + } + const requestUrl = validation.url; + currentUrl = requestUrl.toString(); + + const response = await fetch( + currentUrl, + buildFetchOptions({ + method: currentMethod, + body: currentBody, + headers: baseHeaders, + agent, + baseOptions, + }), + ); + + if (!isRedirectResponse(response)) { + return response; + } + + const locationHeader = response.headers.get("location"); + if (!locationHeader) { + return response; + } + + if (redirectsRemaining <= 0) { + throw new Error(`Too many redirects while fetching ${url}`); + } + + const nextUrl = new URL(locationHeader, currentUrl); + + if ( + response.status === 303 || + ((response.status === 301 || response.status === 302) && + currentMethod !== "GET" && + currentMethod !== "HEAD") + ) { + currentMethod = "GET"; + currentBody = undefined; + baseHeaders.delete("content-length"); + } + + currentUrl = nextUrl.toString(); + redirectsRemaining -= 1; + } +}; diff --git a/apps/workers/package.json b/apps/workers/package.json index b02c3bc9..f35a52f4 100644 --- a/apps/workers/package.json +++ b/apps/workers/package.json @@ -23,8 +23,10 @@ "hono": "^4.7.10", "http-proxy-agent": "^7.0.2", "https-proxy-agent": "^7.0.6", + "ipaddr.js": "^2.2.0", "jsdom": "^24.0.0", "liteque": "^0.6.2", + "lru-cache": "^11.2.2", "metascraper": "^5.49.5", "metascraper-amazon": "^5.49.5", "metascraper-author": "^5.49.5", diff --git a/apps/workers/utils.ts b/apps/workers/utils.ts index a82dd12d..2f56d3f0 100644 --- a/apps/workers/utils.ts +++ b/apps/workers/utils.ts @@ -1,9 +1,3 @@ -import { 
HttpProxyAgent } from "http-proxy-agent"; -import { HttpsProxyAgent } from "https-proxy-agent"; -import fetch from "node-fetch"; - -import serverConfig from "@karakeep/shared/config"; - export function withTimeout<T, Ret>( func: (param: T) => Promise<Ret>, timeoutSec: number, @@ -20,58 +14,3 @@ export function withTimeout<T, Ret>( func: (param: T) => Promise<Ret>, timeoutSec: number, ) { ]); }; } - -export function getRandomProxy(proxyList: string[]): string { - return proxyList[Math.floor(Math.random() * proxyList.length)].trim(); -} - -function getProxyAgent(url: string) { - const { proxy } = serverConfig; - - if (!proxy.httpProxy && !proxy.httpsProxy) { - return undefined; - } - - const urlObj = new URL(url); - const protocol = urlObj.protocol; - - // Check if URL should bypass proxy - if (proxy.noProxy) { - const noProxyList = proxy.noProxy.split(",").map((host) => host.trim()); - const hostname = urlObj.hostname; - - for (const noProxyHost of noProxyList) { - if ( - noProxyHost === hostname || - (noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) || - hostname.endsWith("." + noProxyHost) - ) { - return undefined; - } - } - } - - if (protocol === "https:" && proxy.httpsProxy) { - const selectedProxy = getRandomProxy(proxy.httpsProxy); - return new HttpsProxyAgent(selectedProxy); - } else if (protocol === "http:" && proxy.httpProxy) { - const selectedProxy = getRandomProxy(proxy.httpProxy); - return new HttpProxyAgent(selectedProxy); - } else if (proxy.httpProxy) { - const selectedProxy = getRandomProxy(proxy.httpProxy); - return new HttpProxyAgent(selectedProxy); - } - - return undefined; -} - -export const fetchWithProxy = ( - url: string, - options: Record<string, any> = {}, -) => { - const agent = getProxyAgent(url); - if (agent) { - options.agent = agent; - } - return fetch(url, options); -}; diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 33ff2851..70b2e644 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -25,10 +25,15 @@ import metascraperTitle from "metascraper-title"; import metascraperTwitter from "metascraper-twitter"; import metascraperUrl from "metascraper-url"; import { workerStatsCounter } from "metrics"; +import { + fetchWithProxy, + getRandomProxy, + matchesNoProxy, + validateUrl, +} from "network"; import { Browser, BrowserContextOptions } from "playwright"; import { chromium } from "playwright-extra"; import StealthPlugin from "puppeteer-extra-plugin-stealth"; -import { fetchWithProxy, getRandomProxy } from "utils"; import { getBookmarkDetails, updateAsset } from "workerUtils"; import { z } from "zod"; @@ -173,7 +178,7 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] { server: proxyUrl, username: parsed.username, password: parsed.password, - bypass: proxy.noProxy, + bypass: proxy.noProxy?.join(","), }; } @@ -355,22 +360,6 @@ async function changeBookmarkStatus( .where(eq(bookmarkLinks.id, bookmarkId)); } -/** - * This provides some "basic" protection from malicious URLs. However, all of those - * can be easily circumvented by pointing dns of origin to localhost, or with - * redirects. 
- */ -function validateUrl(url: string) { - const urlParsed = new URL(url); - if (urlParsed.protocol != "http:" && urlParsed.protocol != "https:") { - throw new Error(`Unsupported URL protocol: ${urlParsed.protocol}`); - } - - if (["localhost", "127.0.0.1", "0.0.0.0"].includes(urlParsed.hostname)) { - throw new Error(`Link hostname rejected: ${urlParsed.hostname}`); - } -} - async function browserlessCrawlPage( jobId: string, url: string, @@ -430,11 +419,15 @@ async function crawlPage( return browserlessCrawlPage(jobId, url, abortSignal); } + const proxyConfig = getPlaywrightProxyConfig(); + const isRunningInProxyContext = + proxyConfig !== undefined && + !matchesNoProxy(url, proxyConfig.bypass?.split(",") ?? []); const context = await browser.newContext({ viewport: { width: 1440, height: 900 }, userAgent: "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36", - proxy: getPlaywrightProxyConfig(), + proxy: proxyConfig, }); try { @@ -453,8 +446,12 @@ async function crawlPage( await globalBlocker.enableBlockingInPage(page); } - // Block audio/video resources - await page.route("**/*", (route) => { + // Block audio/video resources and disallowed sub-requests + await page.route("**/*", async (route) => { + if (abortSignal.aborted) { + await route.abort("aborted"); + return; + } const request = route.request(); const resourceType = request.resourceType(); @@ -464,18 +461,49 @@ async function crawlPage( request.headers()["content-type"]?.includes("video/") || request.headers()["content-type"]?.includes("audio/") ) { - route.abort(); + await route.abort("aborted"); return; } + const requestUrl = request.url(); + const requestIsRunningInProxyContext = + proxyConfig !== undefined && + !matchesNoProxy(requestUrl, proxyConfig.bypass?.split(",") ?? []); + if ( + requestUrl.startsWith("http://") || + requestUrl.startsWith("https://") + ) { + const validation = await validateUrl( + requestUrl, + requestIsRunningInProxyContext, + ); + if (!validation.ok) { + logger.warn( + `[Crawler][${jobId}] Blocking sub-request to disallowed URL "${requestUrl}": ${validation.reason}`, + ); + await route.abort("blockedbyclient"); + return; + } + } + // Continue with other requests - route.continue(); + await route.continue(); }); // Navigate to the target URL - logger.info(`[Crawler][${jobId}] Navigating to "${url}"`); + const navigationValidation = await validateUrl( + url, + isRunningInProxyContext, + ); + if (!navigationValidation.ok) { + throw new Error( + `Disallowed navigation target "${url}": ${navigationValidation.reason}`, + ); + } + const targetUrl = navigationValidation.url.toString(); + logger.info(`[Crawler][${jobId}] Navigating to "${targetUrl}"`); const response = await Promise.race([ - page.goto(url, { + page.goto(targetUrl, { timeout: serverConfig.crawler.navigateTimeoutSec * 1000, waitUntil: "domcontentloaded", }), @@ -483,7 +511,7 @@ async function crawlPage( ]); logger.info( - `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`, + `[Crawler][${jobId}] Successfully navigated to "${targetUrl}". 
Waiting for the page to load ...`, ); // Wait until network is relatively idle or timeout after 5 seconds @@ -1231,7 +1259,6 @@ async function runCrawler(job: DequeuedJob) { logger.info( `[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`, ); - validateUrl(url); const contentType = await getContentType(url, jobId, job.abortSignal); job.abortSignal.throwIfAborted(); diff --git a/apps/workers/workers/feedWorker.ts b/apps/workers/workers/feedWorker.ts index 38b06c47..f86e7424 100644 --- a/apps/workers/workers/feedWorker.ts +++ b/apps/workers/workers/feedWorker.ts @@ -1,9 +1,9 @@ import { and, eq, inArray } from "drizzle-orm"; import { workerStatsCounter } from "metrics"; +import { fetchWithProxy } from "network"; import cron from "node-cron"; import Parser from "rss-parser"; import { buildImpersonatingTRPCClient } from "trpc"; -import { fetchWithProxy } from "utils"; import { z } from "zod"; import type { ZFeedRequestSchema } from "@karakeep/shared-server"; diff --git a/apps/workers/workers/videoWorker.ts b/apps/workers/workers/videoWorker.ts index a41eb069..8d3ac666 100644 --- a/apps/workers/workers/videoWorker.ts +++ b/apps/workers/workers/videoWorker.ts @@ -3,6 +3,7 @@ import * as os from "os"; import path from "path"; import { execa } from "execa"; import { workerStatsCounter } from "metrics"; +import { getProxyAgent, validateUrl } from "network"; import { db } from "@karakeep/db"; import { AssetTypes } from "@karakeep/db/schema"; @@ -62,7 +63,11 @@ export class VideoWorker { } } -function prepareYtDlpArguments(url: string, assetPath: string) { +function prepareYtDlpArguments( + url: string, + proxy: string | undefined, + assetPath: string, +) { const ytDlpArguments = [url]; if (serverConfig.crawler.maxVideoDownloadSize > 0) { ytDlpArguments.push( @@ -74,6 +79,9 @@ function prepareYtDlpArguments(url: string, assetPath: string) { ytDlpArguments.push(...serverConfig.crawler.ytDlpArguments); ytDlpArguments.push("-o", assetPath); ytDlpArguments.push("--no-playlist"); + if (proxy) { + ytDlpArguments.push("--proxy", proxy); + } return ytDlpArguments; } @@ -94,15 +102,29 @@ async function runWorker(job: DequeuedJob) { return; } + const proxy = getProxyAgent(url); + const validation = await validateUrl(url, !!proxy); + if (!validation.ok) { + logger.warn( + `[VideoCrawler][${jobId}] Skipping video download to disallowed URL "${url}": ${validation.reason}`, + ); + return; + } + const normalizedUrl = validation.url.toString(); + const videoAssetId = newAssetId(); let assetPath = `${TMP_FOLDER}/${videoAssetId}`; await fs.promises.mkdir(TMP_FOLDER, { recursive: true }); - const ytDlpArguments = prepareYtDlpArguments(url, assetPath); + const ytDlpArguments = prepareYtDlpArguments( + normalizedUrl, + proxy?.proxy.toString(), + assetPath, + ); try { logger.info( - `[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`, + `[VideoCrawler][${jobId}] Attempting to download a file from "${normalizedUrl}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`, ); await execa("yt-dlp", ytDlpArguments, { @@ -123,11 +145,11 @@ async function runWorker(job: DequeuedJob) { err.message.includes("No media found") ) { logger.info( - `[VideoCrawler][${jobId}] Skipping video download from "${url}", because it's not one of the supported yt-dlp URLs`, + `[VideoCrawler][${jobId}] Skipping video download from "${normalizedUrl}", because it's not one of the supported yt-dlp URLs`, ); return; } - const 
genericError = `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}"`; + const genericError = `[VideoCrawler][${jobId}] Failed to download a file from "${normalizedUrl}" to "${assetPath}"`; if ("stderr" in err) { logger.error(`${genericError}: ${err.stderr}`); } else { @@ -138,7 +160,7 @@ async function runWorker(job: DequeuedJob) { } logger.info( - `[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`, + `[VideoCrawler][${jobId}] Finished downloading a file from "${normalizedUrl}" to "${assetPath}"`, ); // Get file size and check quota before saving @@ -177,7 +199,7 @@ async function runWorker(job: DequeuedJob) { await silentDeleteAsset(userId, oldVideoAssetId); logger.info( - `[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`, + `[VideoCrawler][${jobId}] Finished downloading video from "${normalizedUrl}" and adding it to the database`, ); } catch (error) { if (error instanceof StorageQuotaError) { diff --git a/apps/workers/workers/webhookWorker.ts b/apps/workers/workers/webhookWorker.ts index 2bbef160..472a27ed 100644 --- a/apps/workers/workers/webhookWorker.ts +++ b/apps/workers/workers/webhookWorker.ts @@ -1,6 +1,6 @@ import { eq } from "drizzle-orm"; import { workerStatsCounter } from "metrics"; -import fetch from "node-fetch"; +import { fetchWithProxy } from "network"; import { db } from "@karakeep/db"; import { bookmarks, webhooksTable } from "@karakeep/db/schema"; @@ -102,7 +102,7 @@ async function runWebhook(job: DequeuedJob) { while (attempt < maxRetries && !success) { try { - const response = await fetch(url, { + const response = await fetchWithProxy(url, { method: "POST", headers: { "Content-Type": "application/json", diff --git a/docs/docs/03-configuration.md b/docs/docs/03-configuration.md index 26760d6c..50280a55 100644 --- a/docs/docs/03-configuration.md +++ b/docs/docs/03-configuration.md @@ -222,11 +222,12 @@ Karakeep can send emails for various purposes such as email verification during If your Karakeep instance needs to connect through a proxy server, you can configure the following settings: -| Name | Required | Default | Description | -| ------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| CRAWLER_HTTP_PROXY | No | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. | -| CRAWLER_HTTPS_PROXY | No | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. 
| -| CRAWLER_NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) | +| Name | Required | Default | Description | +| ---------------------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CRAWLER_HTTP_PROXY | No | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. The proxy is used for crawling, RSS feed fetches and webhooks. | +| CRAWLER_HTTPS_PROXY | No | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. The proxy is used for crawling, RSS feed fetches and webhooks. | +| CRAWLER_NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) | +| CRAWLER_ALLOWED_INTERNAL_HOSTNAMES | No | Not set | By default, Karakeep blocks worker-initiated requests whose DNS resolves to private, loopback, or link-local IP addresses. Use this to allowlist specific hostnames for internal access (e.g., `internal.company.com,.local`). Supports domain wildcards by prefixing with a dot (e.g., `.internal.company.com`). Note: Internal IP validation is bypassed when a proxy is configured for the URL as the local DNS resolver won't necessarily be the same as the one used by the proxy. | :::info These proxy settings will be used by the crawler and other components that make outgoing HTTP requests. 
diff --git a/packages/shared/config.ts b/packages/shared/config.ts index d54b7589..51b591ad 100644 --- a/packages/shared/config.ts +++ b/packages/shared/config.ts @@ -104,6 +104,7 @@ const allEnv = z.object({ .default("") .transform((t) => t.split("%%").filter((a) => a)), CRAWLER_SCREENSHOT_TIMEOUT_SEC: z.coerce.number().default(5), + CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC: z.coerce.number().default(1), LOG_LEVEL: z.string().default("debug"), NO_COLOR: stringBool("false"), DEMO_MODE: stringBool("false"), @@ -178,7 +179,24 @@ const allEnv = z.object({ .filter((p) => p), ) .optional(), - CRAWLER_NO_PROXY: z.string().optional(), + CRAWLER_NO_PROXY: z + .string() + .transform((val) => + val + .split(",") + .map((p) => p.trim()) + .filter((p) => p), + ) + .optional(), + CRAWLER_ALLOWED_INTERNAL_HOSTNAMES: z + .string() + .transform((val) => + val + .split(",") + .map((p) => p.trim()) + .filter((p) => p), + ) + .optional(), // Database configuration DB_WAL_MODE: stringBool("false"), @@ -276,6 +294,10 @@ const serverConfigSchema = allEnv.transform((val, ctx) => { ytDlpArguments: val.CRAWLER_YTDLP_ARGS, screenshotTimeoutSec: val.CRAWLER_SCREENSHOT_TIMEOUT_SEC, htmlContentSizeThreshold: val.HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES, + ipValidation: { + dnsResolverTimeoutSec: + val.CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC, + }, }, ocr: { langs: val.OCR_LANGS, @@ -309,6 +331,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => { httpsProxy: val.CRAWLER_HTTPS_PROXY, noProxy: val.CRAWLER_NO_PROXY, }, + allowedInternalHostnames: val.CRAWLER_ALLOWED_INTERNAL_HOSTNAMES, assetPreprocessing: { numWorkers: val.ASSET_PREPROCESSING_NUM_WORKERS, }, diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 8d068c65..a3ca5ec8 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -826,12 +826,18 @@ importers: https-proxy-agent: specifier: ^7.0.6 version: 7.0.6(supports-color@10.0.0) + ipaddr.js: + specifier: ^2.2.0 + version: 2.2.0 jsdom: specifier: ^24.0.0 version: 24.1.3 liteque: specifier: ^0.6.2 version: 0.6.2(@opentelemetry/api@1.9.0)(@types/better-sqlite3@7.6.13)(@types/react@19.2.2)(better-sqlite3@11.3.0)(kysely@0.28.5)(react@19.1.0) + lru-cache: + specifier: ^11.2.2 + version: 11.2.2 metascraper: specifier: ^5.49.5 version: 5.49.5(postcss@8.5.6) @@ -1797,8 +1803,8 @@ packages: resolution: {integrity: sha512-lJjzvrbEeWrhB4P3QBsH7tey117PjLZnDbLiQEKjQ/fNJTjuq4HSqgFA+UNSwZT8D7dxxbnuSBMsa1lrWzKlQg==} engines: {node: '>=6.9.0'} - '@babel/generator@7.28.3': - resolution: {integrity: sha512-3lSpxGgvnmZznmBkCRnVREPUFJv2wrv9iAoFDvADJc0ypmdOxdUtcLeBgBJ6zE0PMeTKnxeQzyk0xTBq4Ep7zw==} + '@babel/generator@7.28.5': + resolution: {integrity: sha512-3EwLFhZ38J4VyIP6WNtt2kUdW9dokXA9Cr4IVIFHuCpZ3H8/YFOl5JjZHisrn1fATPBmKKqXzDFvh9fUwHz6CQ==} engines: {node: '>=6.9.0'} '@babel/helper-annotate-as-pure@7.27.3': @@ -1876,6 +1882,10 @@ packages: resolution: {integrity: sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==} engines: {node: '>=6.9.0'} + '@babel/helper-validator-identifier@7.28.5': + resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==} + engines: {node: '>=6.9.0'} + '@babel/helper-validator-option@7.27.1': resolution: {integrity: sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==} engines: {node: '>=6.9.0'} @@ -1902,8 +1912,8 @@ packages: engines: {node: '>=6.0.0'} hasBin: true - '@babel/parser@7.28.4': - resolution: {integrity: 
sha512-yZbBqeM6TkpP9du/I2pUZnJsRMGGvOuIrhjzC1AwHwW+6he4mni6Bp/m8ijn0iOuZuPI2BfkCoSRunpyjnrQKg==} + '@babel/parser@7.28.5': + resolution: {integrity: sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==} engines: {node: '>=6.0.0'} hasBin: true @@ -2488,8 +2498,8 @@ packages: resolution: {integrity: sha512-mGe7UK5wWyh0bKRfupsUchrQGqvDbZDbKJw+kcRGSmdHVYrv+ltd0pnpDTVpiTqnaBru9iEvA8pz8W46v0Amwg==} engines: {node: '>=6.9.0'} - '@babel/traverse@7.28.4': - resolution: {integrity: sha512-YEzuboP2qvQavAcjgQNVgsvHIDv6ZpwXvcvjmyySP2DIMuByS/6ioU5G9pYrWHM6T2YDfc7xga9iNzYOs12CFQ==} + '@babel/traverse@7.28.5': + resolution: {integrity: sha512-TCCj4t55U90khlYkVV/0TfkJkAkUg3jZFA3Neb7unZT8CPok7iiRfaX0F+WnqWqt7OxhOn0uBKXCw4lbL8W0aQ==} engines: {node: '>=6.9.0'} '@babel/types@7.27.6': @@ -2500,8 +2510,8 @@ packages: resolution: {integrity: sha512-x0LvFTekgSX+83TI28Y9wYPUfzrnl2aT5+5QLnO6v7mSJYtEEevuDRN0F0uSHRk1G1IWZC43o00Y0xDDrpBGPQ==} engines: {node: '>=6.9.0'} - '@babel/types@7.28.4': - resolution: {integrity: sha512-bkFqkLhh3pMBUQQkpVgWDWq/lqzc2678eUyDlTBhRqhCHFguYYGM0Efga7tYk4TogG/3x0EEl66/OQ+WGbWB/Q==} + '@babel/types@7.28.5': + resolution: {integrity: sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==} engines: {node: '>=6.9.0'} '@colors/colors@1.5.0': @@ -10064,10 +10074,6 @@ packages: lru-cache@10.4.3: resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==} - lru-cache@11.1.0: - resolution: {integrity: sha512-QIXZUBJUx+2zHUdQujWejBkcD9+cs94tLn0+YL8UrCh+D5sCXZ4c7LaEH48pNwRY3MLDgqUFyhlCyjJPf1WP0A==} - engines: {node: 20 || >=22} - lru-cache@11.2.2: resolution: {integrity: sha512-F9ODfyqML2coTIsQpSkRHnLSZMtkU8Q+mSfcaIyKwy58u+8k5nvAYeiNhsyMARvzNcXJ9QfWVrcPsC9e9rAxtg==} engines: {node: 20 || >=22} @@ -12998,6 +13004,27 @@ packages: webpack: optional: true + sass-loader@16.0.6: + resolution: {integrity: sha512-sglGzId5gmlfxNs4gK2U3h7HlVRfx278YK6Ono5lwzuvi1jxig80YiuHkaDBVsYIKFhx8wN7XSCI0M2IDS/3qA==} + engines: {node: '>= 18.12.0'} + peerDependencies: + '@rspack/core': 0.x || 1.x + node-sass: ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0 || ^9.0.0 + sass: ^1.3.0 + sass-embedded: '*' + webpack: ^5.0.0 + peerDependenciesMeta: + '@rspack/core': + optional: true + node-sass: + optional: true + sass: + optional: true + sass-embedded: + optional: true + webpack: + optional: true + sass@1.89.1: resolution: {integrity: sha512-eMLLkl+qz7tx/0cJ9wI+w09GQ2zodTkcE/aVfywwdlRcI3EO19xGnbmJwg/JMIm+5MxVJ6outddLZ4Von4E++Q==} engines: {node: '>=14.0.0'} @@ -15578,10 +15605,10 @@ snapshots: '@jridgewell/trace-mapping': 0.3.29 jsesc: 3.1.0 - '@babel/generator@7.28.3': + '@babel/generator@7.28.5': dependencies: - '@babel/parser': 7.28.4 - '@babel/types': 7.28.4 + '@babel/parser': 7.28.5 + '@babel/types': 7.28.5 '@jridgewell/gen-mapping': 0.3.13 '@jridgewell/trace-mapping': 0.3.31 jsesc: 3.1.0 @@ -15727,13 +15754,15 @@ snapshots: '@babel/helper-validator-identifier@7.27.1': {} + '@babel/helper-validator-identifier@7.28.5': {} + '@babel/helper-validator-option@7.27.1': {} '@babel/helper-wrap-function@7.27.1': dependencies: '@babel/template': 7.27.2 '@babel/traverse': 7.28.0 - '@babel/types': 7.28.4 + '@babel/types': 7.28.5 transitivePeerDependencies: - supports-color @@ -15757,9 +15786,9 @@ snapshots: dependencies: '@babel/types': 7.28.1 - '@babel/parser@7.28.4': + '@babel/parser@7.28.5': dependencies: - '@babel/types': 7.28.4 + '@babel/types': 7.28.5 
'@babel/plugin-bugfix-firefox-class-in-computed-class-key@7.27.1(@babel/core@7.26.0)': dependencies: @@ -16644,14 +16673,14 @@ snapshots: transitivePeerDependencies: - supports-color - '@babel/traverse@7.28.4': + '@babel/traverse@7.28.5': dependencies: '@babel/code-frame': 7.27.1 - '@babel/generator': 7.28.3 + '@babel/generator': 7.28.5 '@babel/helper-globals': 7.28.0 - '@babel/parser': 7.28.4 + '@babel/parser': 7.28.5 '@babel/template': 7.27.2 - '@babel/types': 7.28.4 + '@babel/types': 7.28.5 debug: 4.4.3 transitivePeerDependencies: - supports-color @@ -16666,10 +16695,10 @@ snapshots: '@babel/helper-string-parser': 7.27.1 '@babel/helper-validator-identifier': 7.27.1 - '@babel/types@7.28.4': + '@babel/types@7.28.5': dependencies: '@babel/helper-string-parser': 7.27.1 - '@babel/helper-validator-identifier': 7.27.1 + '@babel/helper-validator-identifier': 7.28.5 '@colors/colors@1.5.0': optional: true @@ -17566,7 +17595,7 @@ snapshots: '@docusaurus/react-loadable@6.0.0(react@19.1.0)': dependencies: - '@types/react': 19.2.2 + '@types/react': 19.1.8 react: 19.1.0 '@docusaurus/theme-classic@3.8.1(@types/react@19.2.2)(acorn@8.15.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.8.3)': @@ -21758,7 +21787,7 @@ snapshots: babel-plugin-macros@3.1.0: dependencies: - '@babel/runtime': 7.28.4 + '@babel/runtime': 7.27.6 cosmiconfig: 7.1.0 resolve: 1.22.10 @@ -23045,7 +23074,7 @@ snapshots: dependencies: '@docusaurus/core': 3.8.1(@mdx-js/react@3.1.0(@types/react@19.2.2)(react@19.1.0))(acorn@8.15.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.8.3) sass: 1.89.1 - sass-loader: 16.0.5(sass@1.89.1)(webpack@5.99.9) + sass-loader: 16.0.6(sass@1.89.1)(webpack@5.99.9) transitivePeerDependencies: - '@rspack/core' - node-sass @@ -25774,8 +25803,6 @@ snapshots: lru-cache@10.4.3: {} - lru-cache@11.1.0: {} - lru-cache@11.2.2: {} lru-cache@5.1.1: @@ -26418,7 +26445,7 @@ snapshots: metro-source-map@0.82.5: dependencies: '@babel/traverse': 7.28.0 - '@babel/traverse--for-generate-function-map': '@babel/traverse@7.28.4' + '@babel/traverse--for-generate-function-map': '@babel/traverse@7.28.5' '@babel/types': 7.28.1 flow-enums-runtime: 0.0.6 invariant: 2.2.4 @@ -27798,7 +27825,7 @@ snapshots: path-scurry@2.0.0: dependencies: - lru-cache: 11.1.0 + lru-cache: 11.2.2 minipass: 7.1.2 path-to-regexp@0.1.12: {} @@ -29756,6 +29783,13 @@ snapshots: sass: 1.89.1 webpack: 5.99.9 + sass-loader@16.0.6(sass@1.89.1)(webpack@5.99.9): + dependencies: + neo-async: 2.6.2 + optionalDependencies: + sass: 1.89.1 + webpack: 5.99.9 + sass@1.89.1: dependencies: chokidar: 4.0.3
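For reviewers, a rough usage sketch of the new `apps/workers/network.ts` surface. It assumes no proxy is configured (so the DNS-based validation path actually runs) and default settings; the URLs are illustrative:

```ts
import { fetchWithProxy, validateUrl } from "network";

async function demo() {
  // Literal IPs in a disallowed range are rejected without any DNS lookup.
  const direct = await validateUrl(
    "http://169.254.169.254/latest/meta-data/",
    /* runningInProxyContext */ false,
  );
  console.log(direct.ok); // false: loopback, link-local, and private ranges are refused

  // Hostnames go through the cached, timeout-bounded resolver; every A/AAAA
  // record must fall outside DISALLOWED_IP_RANGES.
  const resolved = await validateUrl("https://example.com", false);
  if (resolved.ok) {
    // fetchWithProxy follows redirects manually (redirect: "manual") and
    // re-validates every hop, so a public URL that 302s to http://127.0.0.1
    // throws instead of being fetched.
    const response = await fetchWithProxy(resolved.url.toString(), {
      maxRedirects: 5,
    });
    console.log(response.status);
  }
}

void demo();
```

One design note: when a proxy applies to a URL, `validateUrl` deliberately skips DNS resolution, since name resolution then happens on the proxy rather than locally; that is why callers such as `videoWorker.ts` pass `!!getProxyAgent(url)` as the second argument.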