fix: Stricter SSRF validation (#2082)

* fix: Stricter SSRF validation

* skip dns resolution if running in proxy context

* more fixes

* Add LRU cache

* change the env variable for internal hostnames

* make dns resolution timeout configurable

* upgrade ipaddr

* handle ipv6

* handle proxy bypass for request interceptor
Mohamed Bassem 2025-11-02 17:19:28 +00:00 · committed by GitHub
parent c6ebceb9f0 · commit b63a49fc39
10 changed files with 602 additions and 135 deletions

apps/workers/network.ts (new file, +419)

@@ -0,0 +1,419 @@
import dns from "node:dns/promises";
import type { HeadersInit, RequestInit, Response } from "node-fetch";
import { HttpProxyAgent } from "http-proxy-agent";
import { HttpsProxyAgent } from "https-proxy-agent";
import ipaddr from "ipaddr.js";
import { LRUCache } from "lru-cache";
import fetch, { Headers } from "node-fetch";
import serverConfig from "@karakeep/shared/config";
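// Special-purpose address ranges, as named by ipaddr.js's range(), that the
// worker refuses to contact directly.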
const DISALLOWED_IP_RANGES = new Set([
// IPv4 ranges
"unspecified",
"broadcast",
"multicast",
"linkLocal",
"loopback",
"private",
"reserved",
"carrierGradeNat",
// IPv6 ranges
"uniqueLocal",
"6to4", // RFC 3056 - IPv6 transition mechanism
"teredo", // RFC 4380 - IPv6 tunneling
"benchmarking", // RFC 5180 - benchmarking addresses
"deprecated", // RFC 3879 - deprecated IPv6 addresses
"discard", // RFC 6666 - discard-only prefix
]);
// DNS cache with 5 minute TTL and max 1000 entries
const dnsCache = new LRUCache<string, string[]>({
max: 1000,
ttl: 5 * 60 * 1000, // 5 minutes in milliseconds
});
async function resolveHostAddresses(hostname: string): Promise<string[]> {
const resolver = new dns.Resolver({
timeout: serverConfig.crawler.ipValidation.dnsResolverTimeoutSec * 1000,
});
const results = await Promise.allSettled([
resolver.resolve4(hostname),
resolver.resolve6(hostname),
]);
const addresses: string[] = [];
const errors: string[] = [];
for (const result of results) {
if (result.status === "fulfilled") {
addresses.push(...result.value);
} else {
const reason = result.reason;
if (reason instanceof Error) {
errors.push(reason.message);
} else {
errors.push(String(reason));
}
}
}
if (addresses.length > 0) {
return addresses;
}
const errorMessage =
errors.length > 0
? errors.join("; ")
: "DNS lookup did not return any A or AAAA records";
throw new Error(errorMessage);
}
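// Treat anything that isn't a valid IP as forbidden, and unwrap IPv4-mapped
// IPv6 addresses (e.g. ::ffff:127.0.0.1) so that mapped loopback/private
// addresses can't slip past the range check.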
function isAddressForbidden(address: string): boolean {
if (!ipaddr.isValid(address)) {
return true;
}
const parsed = ipaddr.parse(address);
if (
parsed.kind() === "ipv6" &&
(parsed as ipaddr.IPv6).isIPv4MappedAddress()
) {
const mapped = (parsed as ipaddr.IPv6).toIPv4Address();
return DISALLOWED_IP_RANGES.has(mapped.range());
}
return DISALLOWED_IP_RANGES.has(parsed.range());
}
export type UrlValidationResult =
| { ok: true; url: URL }
| { ok: false; reason: string };
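// A pattern matches on exact equality, as a leading-dot suffix (".local"
// matches "printer.local"), or as a parent domain ("corp.com" matches
// "api.corp.com").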
function hostnameMatchesAnyPattern(
hostname: string,
patterns: string[],
): boolean {
function hostnameMatchesPattern(hostname: string, pattern: string): boolean {
return (
pattern === hostname ||
(pattern.startsWith(".") && hostname.endsWith(pattern)) ||
hostname.endsWith("." + pattern)
);
}
for (const pattern of patterns) {
if (hostnameMatchesPattern(hostname, pattern)) {
return true;
}
}
return false;
}
function isHostnameAllowedForInternalAccess(hostname: string): boolean {
if (!serverConfig.allowedInternalHostnames) {
return false;
}
return hostnameMatchesAnyPattern(
hostname,
serverConfig.allowedInternalHostnames,
);
}
export async function validateUrl(
urlCandidate: string,
runningInProxyContext: boolean,
): Promise<UrlValidationResult> {
let parsedUrl: URL;
try {
parsedUrl = new URL(urlCandidate);
} catch (error) {
return {
ok: false,
reason: `Invalid URL "${urlCandidate}": ${
error instanceof Error ? error.message : String(error)
}`,
} as const;
}
if (parsedUrl.protocol !== "http:" && parsedUrl.protocol !== "https:") {
return {
ok: false,
reason: `Unsupported protocol for URL: ${parsedUrl.toString()}`,
} as const;
}
const hostname = parsedUrl.hostname;
if (!hostname) {
return {
ok: false,
reason: `URL ${parsedUrl.toString()} must include a hostname`,
} as const;
}
if (isHostnameAllowedForInternalAccess(hostname)) {
return { ok: true, url: parsedUrl } as const;
}
if (ipaddr.isValid(hostname)) {
if (isAddressForbidden(hostname)) {
return {
ok: false,
reason: `Refusing to access disallowed IP address ${hostname} (requested via ${parsedUrl.toString()})`,
} as const;
}
return { ok: true, url: parsedUrl } as const;
}
if (runningInProxyContext) {
// If we're running in a proxy context, we must skip DNS resolution
// as the DNS resolution will be handled by the proxy
return { ok: true, url: parsedUrl } as const;
}
// Check cache first
let records = dnsCache.get(hostname);
if (!records) {
// Cache miss or expired - perform DNS resolution
try {
records = await resolveHostAddresses(hostname);
dnsCache.set(hostname, records);
} catch (error) {
return {
ok: false,
reason: `Failed to resolve hostname ${hostname}: ${
error instanceof Error ? error.message : String(error)
}`,
} as const;
}
}
if (!records || records.length === 0) {
return {
ok: false,
reason: `DNS lookup for ${hostname} did not return any addresses (requested via ${parsedUrl.toString()})`,
} as const;
}
for (const record of records) {
if (isAddressForbidden(record)) {
return {
ok: false,
reason: `Refusing to access disallowed resolved address ${record} for host ${hostname}`,
} as const;
}
}
return { ok: true, url: parsedUrl } as const;
}
export function getRandomProxy(proxyList: string[]): string {
return proxyList[Math.floor(Math.random() * proxyList.length)].trim();
}
export function matchesNoProxy(url: string, noProxy: string[]) {
const urlObj = new URL(url);
const hostname = urlObj.hostname;
return hostnameMatchesAnyPattern(hostname, noProxy);
}
export function getProxyAgent(url: string) {
const { proxy } = serverConfig;
if (!proxy.httpProxy && !proxy.httpsProxy) {
return undefined;
}
const urlObj = new URL(url);
const protocol = urlObj.protocol;
// Check if URL should bypass proxy
if (proxy.noProxy && matchesNoProxy(url, proxy.noProxy)) {
return undefined;
}
if (protocol === "https:" && proxy.httpsProxy) {
const selectedProxy = getRandomProxy(proxy.httpsProxy);
return new HttpsProxyAgent(selectedProxy);
} else if (protocol === "http:" && proxy.httpProxy) {
const selectedProxy = getRandomProxy(proxy.httpProxy);
return new HttpProxyAgent(selectedProxy);
} else if (proxy.httpProxy) {
const selectedProxy = getRandomProxy(proxy.httpProxy);
return new HttpProxyAgent(selectedProxy);
}
return undefined;
}
function cloneHeaders(init?: HeadersInit): Headers {
const headers = new Headers();
if (!init) {
return headers;
}
if (init instanceof Headers) {
init.forEach((value, key) => {
headers.set(key, value);
});
return headers;
}
if (Array.isArray(init)) {
for (const [key, value] of init) {
headers.append(key, value);
}
return headers;
}
for (const [key, value] of Object.entries(init)) {
if (Array.isArray(value)) {
headers.set(key, value.join(", "));
} else if (value !== undefined) {
headers.set(key, value);
}
}
return headers;
}
function isRedirectResponse(response: Response): boolean {
return (
response.status === 301 ||
response.status === 302 ||
response.status === 303 ||
response.status === 307 ||
response.status === 308
);
}
export type FetchWithProxyOptions = Omit<
RequestInit & {
maxRedirects?: number;
},
"agent"
>;
interface PreparedFetchOptions {
maxRedirects: number;
baseHeaders: Headers;
method: string;
body?: RequestInit["body"];
baseOptions: RequestInit;
}
export function prepareFetchOptions(
options: FetchWithProxyOptions = {},
): PreparedFetchOptions {
const {
maxRedirects = 5,
headers: initHeaders,
method: initMethod,
body: initBody,
redirect: _ignoredRedirect,
...restOptions
} = options;
const baseOptions = restOptions as RequestInit;
return {
maxRedirects,
baseHeaders: cloneHeaders(initHeaders),
method: initMethod?.toUpperCase?.() ?? "GET",
body: initBody,
baseOptions,
};
}
interface BuildFetchOptionsInput {
method: string;
body?: RequestInit["body"];
headers: Headers;
agent?: RequestInit["agent"];
baseOptions: RequestInit;
}
export function buildFetchOptions({
method,
body,
headers,
agent,
baseOptions,
}: BuildFetchOptionsInput): RequestInit {
return {
...baseOptions,
method,
body,
headers,
agent,
redirect: "manual",
};
}
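// Fetches with redirect: "manual" and follows redirects in a loop so that
// every hop is re-validated by validateUrl; otherwise an allowed URL could
// redirect to an internal address and bypass the SSRF checks.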
export const fetchWithProxy = async (
url: string,
options: FetchWithProxyOptions = {},
) => {
const {
maxRedirects,
baseHeaders,
method: preparedMethod,
body: preparedBody,
baseOptions,
} = prepareFetchOptions(options);
let redirectsRemaining = maxRedirects;
let currentUrl = url;
let currentMethod = preparedMethod;
let currentBody = preparedBody;
while (true) {
const agent = getProxyAgent(currentUrl);
const validation = await validateUrl(currentUrl, !!agent);
if (!validation.ok) {
throw new Error(validation.reason);
}
const requestUrl = validation.url;
currentUrl = requestUrl.toString();
const response = await fetch(
currentUrl,
buildFetchOptions({
method: currentMethod,
body: currentBody,
headers: baseHeaders,
agent,
baseOptions,
}),
);
if (!isRedirectResponse(response)) {
return response;
}
const locationHeader = response.headers.get("location");
if (!locationHeader) {
return response;
}
if (redirectsRemaining <= 0) {
throw new Error(`Too many redirects while fetching ${url}`);
}
const nextUrl = new URL(locationHeader, currentUrl);
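    // Per standard fetch semantics: 303 always downgrades to GET; 301/302 do
    // so for non-GET/HEAD requests, dropping the body and its content-length.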
if (
response.status === 303 ||
((response.status === 301 || response.status === 302) &&
currentMethod !== "GET" &&
currentMethod !== "HEAD")
) {
currentMethod = "GET";
currentBody = undefined;
baseHeaders.delete("content-length");
}
currentUrl = nextUrl.toString();
redirectsRemaining -= 1;
}
};
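For orientation, a minimal sketch (not part of the commit) of how these helpers compose; the URLs below are invented:

```ts
import { fetchWithProxy, validateUrl } from "network";

// An IP literal in a disallowed range is rejected before any request is made.
const check = await validateUrl("http://169.254.169.254/latest/meta-data", false);
// => { ok: false, reason: "Refusing to access disallowed IP address ..." }

// fetchWithProxy validates the initial URL and every redirect hop, so a public
// page that 302s to http://127.0.0.1:8080 throws instead of being fetched.
const response = await fetchWithProxy("https://example.com/", { maxRedirects: 3 });
```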

apps/workers/package.json

@@ -23,8 +23,10 @@
"hono": "^4.7.10",
"http-proxy-agent": "^7.0.2",
"https-proxy-agent": "^7.0.6",
"ipaddr.js": "^2.2.0",
"jsdom": "^24.0.0",
"liteque": "^0.6.2",
"lru-cache": "^11.2.2",
"metascraper": "^5.49.5",
"metascraper-amazon": "^5.49.5",
"metascraper-author": "^5.49.5",

apps/workers/utils.ts

@@ -1,9 +1,3 @@
import { HttpProxyAgent } from "http-proxy-agent";
import { HttpsProxyAgent } from "https-proxy-agent";
import fetch from "node-fetch";
import serverConfig from "@karakeep/shared/config";
export function withTimeout<T, Ret>(
func: (param: T) => Promise<Ret>,
timeoutSec: number,
@@ -20,58 +14,3 @@ export function withTimeout<T, Ret>(
]);
};
}
export function getRandomProxy(proxyList: string[]): string {
return proxyList[Math.floor(Math.random() * proxyList.length)].trim();
}
function getProxyAgent(url: string) {
const { proxy } = serverConfig;
if (!proxy.httpProxy && !proxy.httpsProxy) {
return undefined;
}
const urlObj = new URL(url);
const protocol = urlObj.protocol;
// Check if URL should bypass proxy
if (proxy.noProxy) {
const noProxyList = proxy.noProxy.split(",").map((host) => host.trim());
const hostname = urlObj.hostname;
for (const noProxyHost of noProxyList) {
if (
noProxyHost === hostname ||
(noProxyHost.startsWith(".") && hostname.endsWith(noProxyHost)) ||
hostname.endsWith("." + noProxyHost)
) {
return undefined;
}
}
}
if (protocol === "https:" && proxy.httpsProxy) {
const selectedProxy = getRandomProxy(proxy.httpsProxy);
return new HttpsProxyAgent(selectedProxy);
} else if (protocol === "http:" && proxy.httpProxy) {
const selectedProxy = getRandomProxy(proxy.httpProxy);
return new HttpProxyAgent(selectedProxy);
} else if (proxy.httpProxy) {
const selectedProxy = getRandomProxy(proxy.httpProxy);
return new HttpProxyAgent(selectedProxy);
}
return undefined;
}
export const fetchWithProxy = (
url: string,
options: Record<string, unknown> = {},
) => {
const agent = getProxyAgent(url);
if (agent) {
options.agent = agent;
}
return fetch(url, options);
};

apps/workers/crawlerWorker.ts

@@ -25,10 +25,15 @@ import metascraperTitle from "metascraper-title";
import metascraperTwitter from "metascraper-twitter";
import metascraperUrl from "metascraper-url";
import { workerStatsCounter } from "metrics";
import {
fetchWithProxy,
getRandomProxy,
matchesNoProxy,
validateUrl,
} from "network";
import { Browser, BrowserContextOptions } from "playwright";
import { chromium } from "playwright-extra";
import StealthPlugin from "puppeteer-extra-plugin-stealth";
import { fetchWithProxy, getRandomProxy } from "utils";
import { getBookmarkDetails, updateAsset } from "workerUtils";
import { z } from "zod";
@@ -173,7 +178,7 @@ function getPlaywrightProxyConfig(): BrowserContextOptions["proxy"] {
server: proxyUrl,
username: parsed.username,
password: parsed.password,
bypass: proxy.noProxy,
bypass: proxy.noProxy?.join(","),
};
}
@@ -355,22 +360,6 @@ async function changeBookmarkStatus(
.where(eq(bookmarkLinks.id, bookmarkId));
}
/**
* This provides some "basic" protection from malicious URLs. However, all of those
* can be easily circumvented by pointing dns of origin to localhost, or with
* redirects.
*/
function validateUrl(url: string) {
const urlParsed = new URL(url);
if (urlParsed.protocol != "http:" && urlParsed.protocol != "https:") {
throw new Error(`Unsupported URL protocol: ${urlParsed.protocol}`);
}
if (["localhost", "127.0.0.1", "0.0.0.0"].includes(urlParsed.hostname)) {
throw new Error(`Link hostname rejected: ${urlParsed.hostname}`);
}
}
async function browserlessCrawlPage(
jobId: string,
url: string,
@@ -430,11 +419,15 @@ async function crawlPage(
return browserlessCrawlPage(jobId, url, abortSignal);
}
const proxyConfig = getPlaywrightProxyConfig();
const isRunningInProxyContext =
proxyConfig !== undefined &&
!matchesNoProxy(url, proxyConfig.bypass?.split(",") ?? []);
const context = await browser.newContext({
viewport: { width: 1440, height: 900 },
userAgent:
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
proxy: getPlaywrightProxyConfig(),
proxy: proxyConfig,
});
try {
@@ -453,8 +446,12 @@ async function crawlPage(
await globalBlocker.enableBlockingInPage(page);
}
// Block audio/video resources
await page.route("**/*", (route) => {
// Block audio/video resources and disallowed sub-requests
await page.route("**/*", async (route) => {
if (abortSignal.aborted) {
await route.abort("aborted");
return;
}
const request = route.request();
const resourceType = request.resourceType();
@@ -464,18 +461,49 @@ async function crawlPage(
request.headers()["content-type"]?.includes("video/") ||
request.headers()["content-type"]?.includes("audio/")
) {
route.abort();
await route.abort("aborted");
return;
}
const requestUrl = request.url();
const requestIsRunningInProxyContext =
proxyConfig !== undefined &&
!matchesNoProxy(requestUrl, proxyConfig.bypass?.split(",") ?? []);
if (
requestUrl.startsWith("http://") ||
requestUrl.startsWith("https://")
) {
const validation = await validateUrl(
requestUrl,
requestIsRunningInProxyContext,
);
if (!validation.ok) {
logger.warn(
`[Crawler][${jobId}] Blocking sub-request to disallowed URL "${requestUrl}": ${validation.reason}`,
);
await route.abort("blockedbyclient");
return;
}
}
// Continue with other requests
route.continue();
await route.continue();
});
// Navigate to the target URL
logger.info(`[Crawler][${jobId}] Navigating to "${url}"`);
const navigationValidation = await validateUrl(
url,
isRunningInProxyContext,
);
if (!navigationValidation.ok) {
throw new Error(
`Disallowed navigation target "${url}": ${navigationValidation.reason}`,
);
}
const targetUrl = navigationValidation.url.toString();
logger.info(`[Crawler][${jobId}] Navigating to "${targetUrl}"`);
const response = await Promise.race([
page.goto(url, {
page.goto(targetUrl, {
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
waitUntil: "domcontentloaded",
}),
@@ -483,7 +511,7 @@ async function crawlPage(
]);
logger.info(
`[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
`[Crawler][${jobId}] Successfully navigated to "${targetUrl}". Waiting for the page to load ...`,
);
// Wait until network is relatively idle or timeout after 5 seconds
@@ -1231,7 +1259,6 @@ async function runCrawler(job: DequeuedJob<ZCrawlLinkRequest>) {
logger.info(
`[Crawler][${jobId}] Will crawl "${url}" for link with id "${bookmarkId}"`,
);
validateUrl(url);
const contentType = await getContentType(url, jobId, job.abortSignal);
job.abortSignal.throwIfAborted();
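To make the per-request proxy-context decision above concrete, a small illustration (hostnames invented):

```ts
import { matchesNoProxy } from "network";

// e.g. CRAWLER_NO_PROXY=.internal.example.com
const noProxy = [".internal.example.com"];

// Bypasses the proxy, so local DNS resolution and IP range checks apply:
matchesNoProxy("https://api.internal.example.com/x", noProxy); // true

// Goes through the proxy, so validateUrl skips local DNS resolution:
matchesNoProxy("https://example.com/", noProxy); // false
```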

apps/workers/feedWorker.ts

@@ -1,9 +1,9 @@
import { and, eq, inArray } from "drizzle-orm";
import { workerStatsCounter } from "metrics";
import { fetchWithProxy } from "network";
import cron from "node-cron";
import Parser from "rss-parser";
import { buildImpersonatingTRPCClient } from "trpc";
import { fetchWithProxy } from "utils";
import { z } from "zod";
import type { ZFeedRequestSchema } from "@karakeep/shared-server";

apps/workers/videoWorker.ts

@@ -3,6 +3,7 @@ import * as os from "os";
import path from "path";
import { execa } from "execa";
import { workerStatsCounter } from "metrics";
import { getProxyAgent, validateUrl } from "network";
import { db } from "@karakeep/db";
import { AssetTypes } from "@karakeep/db/schema";
@@ -62,7 +63,11 @@ export class VideoWorker {
}
}
function prepareYtDlpArguments(url: string, assetPath: string) {
function prepareYtDlpArguments(
url: string,
proxy: string | undefined,
assetPath: string,
) {
const ytDlpArguments = [url];
if (serverConfig.crawler.maxVideoDownloadSize > 0) {
ytDlpArguments.push(
@@ -74,6 +79,9 @@ function prepareYtDlpArguments(url: string, assetPath: string) {
ytDlpArguments.push(...serverConfig.crawler.ytDlpArguments);
ytDlpArguments.push("-o", assetPath);
ytDlpArguments.push("--no-playlist");
if (proxy) {
ytDlpArguments.push("--proxy", proxy);
}
return ytDlpArguments;
}
@@ -94,15 +102,29 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
return;
}
const proxy = getProxyAgent(url);
const validation = await validateUrl(url, !!proxy);
if (!validation.ok) {
logger.warn(
`[VideoCrawler][${jobId}] Skipping video download to disallowed URL "${url}": ${validation.reason}`,
);
return;
}
const normalizedUrl = validation.url.toString();
const videoAssetId = newAssetId();
let assetPath = `${TMP_FOLDER}/${videoAssetId}`;
await fs.promises.mkdir(TMP_FOLDER, { recursive: true });
const ytDlpArguments = prepareYtDlpArguments(url, assetPath);
const ytDlpArguments = prepareYtDlpArguments(
normalizedUrl,
proxy?.proxy.toString(),
assetPath,
);
try {
logger.info(
`[VideoCrawler][${jobId}] Attempting to download a file from "${url}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
`[VideoCrawler][${jobId}] Attempting to download a file from "${normalizedUrl}" to "${assetPath}" using the following arguments: "${ytDlpArguments}"`,
);
await execa("yt-dlp", ytDlpArguments, {
@@ -123,11 +145,11 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
err.message.includes("No media found")
) {
logger.info(
`[VideoCrawler][${jobId}] Skipping video download from "${url}", because it's not one of the supported yt-dlp URLs`,
`[VideoCrawler][${jobId}] Skipping video download from "${normalizedUrl}", because it's not one of the supported yt-dlp URLs`,
);
return;
}
const genericError = `[VideoCrawler][${jobId}] Failed to download a file from "${url}" to "${assetPath}"`;
const genericError = `[VideoCrawler][${jobId}] Failed to download a file from "${normalizedUrl}" to "${assetPath}"`;
if ("stderr" in err) {
logger.error(`${genericError}: ${err.stderr}`);
} else {
@@ -138,7 +160,7 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
}
logger.info(
`[VideoCrawler][${jobId}] Finished downloading a file from "${url}" to "${assetPath}"`,
`[VideoCrawler][${jobId}] Finished downloading a file from "${normalizedUrl}" to "${assetPath}"`,
);
// Get file size and check quota before saving
@@ -177,7 +199,7 @@ async function runWorker(job: DequeuedJob<ZVideoRequest>) {
await silentDeleteAsset(userId, oldVideoAssetId);
logger.info(
`[VideoCrawler][${jobId}] Finished downloading video from "${url}" and adding it to the database`,
`[VideoCrawler][${jobId}] Finished downloading video from "${normalizedUrl}" and adding it to the database`,
);
} catch (error) {
if (error instanceof StorageQuotaError) {

apps/workers/webhookWorker.ts

@@ -1,6 +1,6 @@
import { eq } from "drizzle-orm";
import { workerStatsCounter } from "metrics";
import fetch from "node-fetch";
import { fetchWithProxy } from "network";
import { db } from "@karakeep/db";
import { bookmarks, webhooksTable } from "@karakeep/db/schema";
@@ -102,7 +102,7 @@ async function runWebhook(job: DequeuedJob<ZWebhookRequest>) {
while (attempt < maxRetries && !success) {
try {
const response = await fetch(url, {
const response = await fetchWithProxy(url, {
method: "POST",
headers: {
"Content-Type": "application/json",

docs/docs/03-configuration.md

@@ -222,11 +222,12 @@ Karakeep can send emails for various purposes such as email verification during
If your Karakeep instance needs to connect through a proxy server, you can configure the following settings:
| Name | Required | Default | Description |
| ------------------- | -------- | ------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
| CRAWLER_HTTP_PROXY | No | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. |
| CRAWLER_HTTPS_PROXY | No | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. |
| CRAWLER_NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) |
| Name | Required | Default | Description |
| ---------------------------------- | -------- | ------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
| CRAWLER_HTTP_PROXY | No | Not set | HTTP proxy server URL for outgoing HTTP requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. The proxy is used for crawling, RSS feed fetches and webhooks. |
| CRAWLER_HTTPS_PROXY | No | Not set | HTTPS proxy server URL for outgoing HTTPS requests (e.g., `http://proxy.example.com:8080`). You can pass multiple comma separated proxies and the used one will be chosen at random. The proxy is used for crawling, RSS feed fetches and webhooks. |
| CRAWLER_NO_PROXY | No | Not set | Comma-separated list of hostnames/IPs that should bypass the proxy (e.g., `localhost,127.0.0.1,.local`) |
| CRAWLER_ALLOWED_INTERNAL_HOSTNAMES | No | Not set | By default, Karakeep blocks worker-initiated requests whose DNS resolves to private, loopback, or link-local IP addresses. Use this to allowlist specific hostnames for internal access (e.g., `internal.company.com,.local`). Supports domain wildcards by prefixing with a dot (e.g., `.internal.company.com`). Note: Internal IP validation is bypassed when a proxy is configured for the URL as the local DNS resolver won't necessarily be the same as the one used by the proxy. |
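For example, a deployment that routes crawling through proxies but still needs direct access to an internal wiki might use something like (values illustrative):

```
CRAWLER_HTTP_PROXY=http://proxy-a.example.com:8080,http://proxy-b.example.com:8080
CRAWLER_NO_PROXY=localhost,127.0.0.1,.local
CRAWLER_ALLOWED_INTERNAL_HOSTNAMES=wiki.internal.example.com,.corp.example.com
```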
:::info
These proxy settings will be used by the crawler and other components that make outgoing HTTP requests.

packages/shared/config.ts

@@ -104,6 +104,7 @@ const allEnv = z.object({
.default("")
.transform((t) => t.split("%%").filter((a) => a)),
CRAWLER_SCREENSHOT_TIMEOUT_SEC: z.coerce.number().default(5),
CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC: z.coerce.number().default(1),
LOG_LEVEL: z.string().default("debug"),
NO_COLOR: stringBool("false"),
DEMO_MODE: stringBool("false"),
@@ -178,7 +179,24 @@ const allEnv = z.object({
.filter((p) => p),
)
.optional(),
CRAWLER_NO_PROXY: z.string().optional(),
CRAWLER_NO_PROXY: z
.string()
.transform((val) =>
val
.split(",")
.map((p) => p.trim())
.filter((p) => p),
)
.optional(),
CRAWLER_ALLOWED_INTERNAL_HOSTNAMES: z
.string()
.transform((val) =>
val
.split(",")
.map((p) => p.trim())
.filter((p) => p),
)
.optional(),
// Database configuration
DB_WAL_MODE: stringBool("false"),
@@ -276,6 +294,10 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
ytDlpArguments: val.CRAWLER_YTDLP_ARGS,
screenshotTimeoutSec: val.CRAWLER_SCREENSHOT_TIMEOUT_SEC,
htmlContentSizeThreshold: val.HTML_CONTENT_SIZE_INLINE_THRESHOLD_BYTES,
ipValidation: {
dnsResolverTimeoutSec:
val.CRAWLER_IP_VALIDATION_DNS_RESOLVER_TIMEOUT_SEC,
},
},
ocr: {
langs: val.OCR_LANGS,
@@ -309,6 +331,7 @@ const serverConfigSchema = allEnv.transform((val, ctx) => {
httpsProxy: val.CRAWLER_HTTPS_PROXY,
noProxy: val.CRAWLER_NO_PROXY,
},
allowedInternalHostnames: val.CRAWLER_ALLOWED_INTERNAL_HOSTNAMES,
assetPreprocessing: {
numWorkers: val.ASSET_PREPROCESSING_NUM_WORKERS,
},
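As a quick sketch of what the new comma-separated transforms yield (zod usage as in the schema above):

```ts
import { z } from "zod";

// Same shape as CRAWLER_NO_PROXY / CRAWLER_ALLOWED_INTERNAL_HOSTNAMES:
// a comma-separated string becomes a trimmed, non-empty string[].
const hostList = z
  .string()
  .transform((val) =>
    val
      .split(",")
      .map((p) => p.trim())
      .filter((p) => p),
  )
  .optional();

hostList.parse("internal.company.com, .local,");
// => ["internal.company.com", ".local"]
```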

pnpm-lock.yaml (generated, 96 lines changed)

@@ -826,12 +826,18 @@ importers:
https-proxy-agent:
specifier: ^7.0.6
version: 7.0.6(supports-color@10.0.0)
ipaddr.js:
specifier: ^2.2.0
version: 2.2.0
jsdom:
specifier: ^24.0.0
version: 24.1.3
liteque:
specifier: ^0.6.2
version: 0.6.2(@opentelemetry/api@1.9.0)(@types/better-sqlite3@7.6.13)(@types/react@19.2.2)(better-sqlite3@11.3.0)(kysely@0.28.5)(react@19.1.0)
lru-cache:
specifier: ^11.2.2
version: 11.2.2
metascraper:
specifier: ^5.49.5
version: 5.49.5(postcss@8.5.6)
@@ -1797,8 +1803,8 @@ packages:
resolution: {integrity: sha512-lJjzvrbEeWrhB4P3QBsH7tey117PjLZnDbLiQEKjQ/fNJTjuq4HSqgFA+UNSwZT8D7dxxbnuSBMsa1lrWzKlQg==}
engines: {node: '>=6.9.0'}
'@babel/generator@7.28.3':
resolution: {integrity: sha512-3lSpxGgvnmZznmBkCRnVREPUFJv2wrv9iAoFDvADJc0ypmdOxdUtcLeBgBJ6zE0PMeTKnxeQzyk0xTBq4Ep7zw==}
'@babel/generator@7.28.5':
resolution: {integrity: sha512-3EwLFhZ38J4VyIP6WNtt2kUdW9dokXA9Cr4IVIFHuCpZ3H8/YFOl5JjZHisrn1fATPBmKKqXzDFvh9fUwHz6CQ==}
engines: {node: '>=6.9.0'}
'@babel/helper-annotate-as-pure@7.27.3':
@@ -1876,6 +1882,10 @@ packages:
resolution: {integrity: sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==}
engines: {node: '>=6.9.0'}
'@babel/helper-validator-identifier@7.28.5':
resolution: {integrity: sha512-qSs4ifwzKJSV39ucNjsvc6WVHs6b7S03sOh2OcHF9UHfVPqWWALUsNUVzhSBiItjRZoLHx7nIarVjqKVusUZ1Q==}
engines: {node: '>=6.9.0'}
'@babel/helper-validator-option@7.27.1':
resolution: {integrity: sha512-YvjJow9FxbhFFKDSuFnVCe2WxXk1zWc22fFePVNEaWJEu8IrZVlda6N0uHwzZrUM1il7NC9Mlp4MaJYbYd9JSg==}
engines: {node: '>=6.9.0'}
@@ -1902,8 +1912,8 @@ packages:
engines: {node: '>=6.0.0'}
hasBin: true
'@babel/parser@7.28.4':
resolution: {integrity: sha512-yZbBqeM6TkpP9du/I2pUZnJsRMGGvOuIrhjzC1AwHwW+6he4mni6Bp/m8ijn0iOuZuPI2BfkCoSRunpyjnrQKg==}
'@babel/parser@7.28.5':
resolution: {integrity: sha512-KKBU1VGYR7ORr3At5HAtUQ+TV3SzRCXmA/8OdDZiLDBIZxVyzXuztPjfLd3BV1PRAQGCMWWSHYhL0F8d5uHBDQ==}
engines: {node: '>=6.0.0'}
hasBin: true
@@ -2488,8 +2498,8 @@ packages:
resolution: {integrity: sha512-mGe7UK5wWyh0bKRfupsUchrQGqvDbZDbKJw+kcRGSmdHVYrv+ltd0pnpDTVpiTqnaBru9iEvA8pz8W46v0Amwg==}
engines: {node: '>=6.9.0'}
'@babel/traverse@7.28.4':
resolution: {integrity: sha512-YEzuboP2qvQavAcjgQNVgsvHIDv6ZpwXvcvjmyySP2DIMuByS/6ioU5G9pYrWHM6T2YDfc7xga9iNzYOs12CFQ==}
'@babel/traverse@7.28.5':
resolution: {integrity: sha512-TCCj4t55U90khlYkVV/0TfkJkAkUg3jZFA3Neb7unZT8CPok7iiRfaX0F+WnqWqt7OxhOn0uBKXCw4lbL8W0aQ==}
engines: {node: '>=6.9.0'}
'@babel/types@7.27.6':
@@ -2500,8 +2510,8 @@ packages:
resolution: {integrity: sha512-x0LvFTekgSX+83TI28Y9wYPUfzrnl2aT5+5QLnO6v7mSJYtEEevuDRN0F0uSHRk1G1IWZC43o00Y0xDDrpBGPQ==}
engines: {node: '>=6.9.0'}
'@babel/types@7.28.4':
resolution: {integrity: sha512-bkFqkLhh3pMBUQQkpVgWDWq/lqzc2678eUyDlTBhRqhCHFguYYGM0Efga7tYk4TogG/3x0EEl66/OQ+WGbWB/Q==}
'@babel/types@7.28.5':
resolution: {integrity: sha512-qQ5m48eI/MFLQ5PxQj4PFaprjyCTLI37ElWMmNs0K8Lk3dVeOdNpB3ks8jc7yM5CDmVC73eMVk/trk3fgmrUpA==}
engines: {node: '>=6.9.0'}
'@colors/colors@1.5.0':
@@ -10064,10 +10074,6 @@ packages:
lru-cache@10.4.3:
resolution: {integrity: sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==}
lru-cache@11.1.0:
resolution: {integrity: sha512-QIXZUBJUx+2zHUdQujWejBkcD9+cs94tLn0+YL8UrCh+D5sCXZ4c7LaEH48pNwRY3MLDgqUFyhlCyjJPf1WP0A==}
engines: {node: 20 || >=22}
lru-cache@11.2.2:
resolution: {integrity: sha512-F9ODfyqML2coTIsQpSkRHnLSZMtkU8Q+mSfcaIyKwy58u+8k5nvAYeiNhsyMARvzNcXJ9QfWVrcPsC9e9rAxtg==}
engines: {node: 20 || >=22}
@@ -12998,6 +13004,27 @@ packages:
webpack:
optional: true
sass-loader@16.0.6:
resolution: {integrity: sha512-sglGzId5gmlfxNs4gK2U3h7HlVRfx278YK6Ono5lwzuvi1jxig80YiuHkaDBVsYIKFhx8wN7XSCI0M2IDS/3qA==}
engines: {node: '>= 18.12.0'}
peerDependencies:
'@rspack/core': 0.x || 1.x
node-sass: ^4.0.0 || ^5.0.0 || ^6.0.0 || ^7.0.0 || ^8.0.0 || ^9.0.0
sass: ^1.3.0
sass-embedded: '*'
webpack: ^5.0.0
peerDependenciesMeta:
'@rspack/core':
optional: true
node-sass:
optional: true
sass:
optional: true
sass-embedded:
optional: true
webpack:
optional: true
sass@1.89.1:
resolution: {integrity: sha512-eMLLkl+qz7tx/0cJ9wI+w09GQ2zodTkcE/aVfywwdlRcI3EO19xGnbmJwg/JMIm+5MxVJ6outddLZ4Von4E++Q==}
engines: {node: '>=14.0.0'}
@@ -15578,10 +15605,10 @@ snapshots:
'@jridgewell/trace-mapping': 0.3.29
jsesc: 3.1.0
'@babel/generator@7.28.3':
'@babel/generator@7.28.5':
dependencies:
'@babel/parser': 7.28.4
'@babel/types': 7.28.4
'@babel/parser': 7.28.5
'@babel/types': 7.28.5
'@jridgewell/gen-mapping': 0.3.13
'@jridgewell/trace-mapping': 0.3.31
jsesc: 3.1.0
@@ -15727,13 +15754,15 @@ snapshots:
'@babel/helper-validator-identifier@7.27.1': {}
'@babel/helper-validator-identifier@7.28.5': {}
'@babel/helper-validator-option@7.27.1': {}
'@babel/helper-wrap-function@7.27.1':
dependencies:
'@babel/template': 7.27.2
'@babel/traverse': 7.28.0
'@babel/types': 7.28.4
'@babel/types': 7.28.5
transitivePeerDependencies:
- supports-color
@@ -15757,9 +15786,9 @@ snapshots:
dependencies:
'@babel/types': 7.28.1
'@babel/parser@7.28.4':
'@babel/parser@7.28.5':
dependencies:
'@babel/types': 7.28.4
'@babel/types': 7.28.5
'@babel/plugin-bugfix-firefox-class-in-computed-class-key@7.27.1(@babel/core@7.26.0)':
dependencies:
@@ -16644,14 +16673,14 @@ snapshots:
transitivePeerDependencies:
- supports-color
'@babel/traverse@7.28.4':
'@babel/traverse@7.28.5':
dependencies:
'@babel/code-frame': 7.27.1
'@babel/generator': 7.28.3
'@babel/generator': 7.28.5
'@babel/helper-globals': 7.28.0
'@babel/parser': 7.28.4
'@babel/parser': 7.28.5
'@babel/template': 7.27.2
'@babel/types': 7.28.4
'@babel/types': 7.28.5
debug: 4.4.3
transitivePeerDependencies:
- supports-color
@@ -16666,10 +16695,10 @@ snapshots:
'@babel/helper-string-parser': 7.27.1
'@babel/helper-validator-identifier': 7.27.1
'@babel/types@7.28.4':
'@babel/types@7.28.5':
dependencies:
'@babel/helper-string-parser': 7.27.1
'@babel/helper-validator-identifier': 7.27.1
'@babel/helper-validator-identifier': 7.28.5
'@colors/colors@1.5.0':
optional: true
@@ -17566,7 +17595,7 @@ snapshots:
'@docusaurus/react-loadable@6.0.0(react@19.1.0)':
dependencies:
'@types/react': 19.2.2
'@types/react': 19.1.8
react: 19.1.0
'@docusaurus/theme-classic@3.8.1(@types/react@19.2.2)(acorn@8.15.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.8.3)':
@@ -21758,7 +21787,7 @@ snapshots:
babel-plugin-macros@3.1.0:
dependencies:
'@babel/runtime': 7.28.4
'@babel/runtime': 7.27.6
cosmiconfig: 7.1.0
resolve: 1.22.10
@@ -23045,7 +23074,7 @@ snapshots:
dependencies:
'@docusaurus/core': 3.8.1(@mdx-js/react@3.1.0(@types/react@19.2.2)(react@19.1.0))(acorn@8.15.0)(react-dom@19.1.0(react@19.1.0))(react@19.1.0)(typescript@5.8.3)
sass: 1.89.1
sass-loader: 16.0.5(sass@1.89.1)(webpack@5.99.9)
sass-loader: 16.0.6(sass@1.89.1)(webpack@5.99.9)
transitivePeerDependencies:
- '@rspack/core'
- node-sass
@@ -25774,8 +25803,6 @@ snapshots:
lru-cache@10.4.3: {}
lru-cache@11.1.0: {}
lru-cache@11.2.2: {}
lru-cache@5.1.1:
@@ -26418,7 +26445,7 @@ snapshots:
metro-source-map@0.82.5:
dependencies:
'@babel/traverse': 7.28.0
'@babel/traverse--for-generate-function-map': '@babel/traverse@7.28.4'
'@babel/traverse--for-generate-function-map': '@babel/traverse@7.28.5'
'@babel/types': 7.28.1
flow-enums-runtime: 0.0.6
invariant: 2.2.4
@@ -27798,7 +27825,7 @@ snapshots:
path-scurry@2.0.0:
dependencies:
lru-cache: 11.1.0
lru-cache: 11.2.2
minipass: 7.1.2
path-to-regexp@0.1.12: {}
@@ -29756,6 +29783,13 @@ snapshots:
sass: 1.89.1
webpack: 5.99.9
sass-loader@16.0.6(sass@1.89.1)(webpack@5.99.9):
dependencies:
neo-async: 2.6.2
optionalDependencies:
sass: 1.89.1
webpack: 5.99.9
sass@1.89.1:
dependencies:
chokidar: 4.0.3