feat(workers): migrate from puppeteer to playwright (#1296)

* feat: convert to playwright

Convert crawling to use Playwright instead of Puppeteer.

- Update Dockerfile to include Playwright
- Update crawler worker to use Playwright API
- Update dependencies

* feat: convert from Puppeteer to Playwright for crawling

* feat: update docker-compose

* use separate browser context for better isolation

* skip chrome download in linux script

* re-add the stealth plugin

---------

Co-authored-by: Mohamed Bassem <me@mbassem.com>
Mael 2025-06-22 19:08:21 +02:00 committed by GitHub
parent 727c7f2270
commit c70d64d4cd
8 changed files with 151 additions and 539 deletions
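
Before the per-file diffs, a note on the shape of the change: Puppeteer hangs pages directly off the browser, while Playwright inserts an explicit browser context between the two, which is what the "separate browser context for better isolation" bullet refers to. A minimal sketch of the new lifecycle (illustrative only, not the worker's actual code; top-level await assumed):

    import { chromium } from "playwright";

    // Puppeteer style (before): const page = await browser.newPage();
    // Playwright style (after): an explicit context gives each crawl
    // isolated cookies, cache, and storage.
    const browser = await chromium.launch();
    const context = await browser.newContext(); // isolation boundary
    const page = await context.newPage();
    await page.goto("https://example.com", { waitUntil: "domcontentloaded" });
    await context.close(); // tears down the page and all per-context state
    await browser.close();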


@@ -4,7 +4,7 @@
   "version": "0.1.0",
   "private": true,
   "dependencies": {
-    "@ghostery/adblocker-puppeteer": "^2.5.1",
+    "@ghostery/adblocker-playwright": "^2.5.1",
     "@karakeep/db": "workspace:^0.1.0",
     "@karakeep/shared": "workspace:^0.1.0",
     "@karakeep/trpc": "workspace:^0.1.0",
@@ -36,8 +36,8 @@
     "pdf2json": "^3.1.5",
     "pdf2pic": "^3.1.3",
     "pdfjs-dist": "^4.2.67",
-    "puppeteer": "^22.0.0",
-    "puppeteer-extra": "^3.3.6",
+    "playwright": "^1.42.1",
+    "playwright-extra": "^4.3.6",
     "puppeteer-extra-plugin-stealth": "^2.11.2",
     "rss-parser": "^3.13.0",
     "tesseract.js": "^5.1.1",


@@ -2,8 +2,7 @@ import * as dns from "dns";
 import { promises as fs } from "fs";
 import * as path from "node:path";
 import * as os from "os";
-import type { Browser } from "puppeteer";
-import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
+import { PlaywrightBlocker } from "@ghostery/adblocker-playwright";
 import { Readability } from "@mozilla/readability";
 import { Mutex } from "async-mutex";
 import DOMPurify from "dompurify";
@@ -25,7 +24,8 @@ import metascraperTitle from "metascraper-title";
 import metascraperTwitter from "metascraper-twitter";
 import metascraperUrl from "metascraper-url";
 import fetch from "node-fetch";
-import puppeteer from "puppeteer-extra";
+import { Browser } from "playwright";
+import { chromium } from "playwright-extra";
 import StealthPlugin from "puppeteer-extra-plugin-stealth";
 import { withTimeout } from "utils";
 import { getBookmarkDetails, updateAsset } from "workerUtils";
@@ -81,38 +81,37 @@ const metascraperParser = metascraper([
 ]);
 let globalBrowser: Browser | undefined;
-let globalBlocker: PuppeteerBlocker | undefined;
+let globalBlocker: PlaywrightBlocker | undefined;
 // Guards the interactions with the browser instance.
 // This is needed given that most of the browser APIs are async.
 const browserMutex = new Mutex();
 async function startBrowserInstance() {
-  const defaultViewport = {
-    width: 1440,
-    height: 900,
-  };
   if (serverConfig.crawler.browserWebSocketUrl) {
     logger.info(
       `[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
     );
-    return puppeteer.connect({
-      browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
-      defaultViewport,
+    return await chromium.connect(serverConfig.crawler.browserWebSocketUrl, {
+      // Important: using slowMo to ensure stability with remote browser
+      slowMo: 100,
+      timeout: 5000,
     });
   } else if (serverConfig.crawler.browserWebUrl) {
     logger.info(
       `[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
    );
     const webUrl = new URL(serverConfig.crawler.browserWebUrl);
     // We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
     const { address } = await dns.promises.lookup(webUrl.hostname);
     webUrl.hostname = address;
     logger.info(
       `[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
     );
-    return puppeteer.connect({
-      browserURL: webUrl.toString(),
-      defaultViewport,
+    return await chromium.connectOverCDP(webUrl.toString(), {
+      // Important: using slowMo to ensure stability with remote browser
+      slowMo: 100,
+      timeout: 5000,
     });
   } else {
     logger.info(`Running in browserless mode`);
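
The two branches above map onto two different transports. `chromium.connect` speaks Playwright's own protocol and expects a Playwright server on the other end, while `chromium.connectOverCDP` attaches to any Chromium exposing the DevTools Protocol, such as the alpine-chrome container in the docker-compose change below. A hedged sketch of the distinction, with placeholder endpoint URLs:

    import { chromium } from "playwright";

    // Playwright-protocol endpoint, e.g. from `npx playwright run-server`.
    const viaPlaywright = await chromium.connect("ws://playwright-server:3000/", {
      slowMo: 100, // the commit throttles actions for remote-browser stability
      timeout: 5000,
    });

    // Raw Chrome DevTools Protocol endpoint; any headless Chromium works.
    const viaCdp = await chromium.connectOverCDP("http://chrome:9222", {
      slowMo: 100,
      timeout: 5000,
    });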
@@ -141,12 +140,12 @@ async function launchBrowser() {
   globalBrowser?.on("disconnected", () => {
     if (isShuttingDown) {
       logger.info(
-        "[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
+        "[Crawler] The Playwright browser got disconnected. But we're shutting down so won't restart it.",
       );
       return;
     }
     logger.info(
-      "[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
+      "[Crawler] The Playwright browser got disconnected. Will attempt to launch it again.",
     );
     launchBrowser();
   });
@@ -155,11 +154,11 @@
 export class CrawlerWorker {
   static async build() {
-    puppeteer.use(StealthPlugin());
+    chromium.use(StealthPlugin());
     if (serverConfig.crawler.enableAdblocker) {
       try {
         logger.info("[crawler] Loading adblocker ...");
-        globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
+        globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(fetch, {
           path: path.join(os.tmpdir(), "karakeep_adblocker.bin"),
           read: fs.readFile,
           write: fs.writeFile,
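
This hunk is why the dependency list above can drop `puppeteer-extra` yet keep `puppeteer-extra-plugin-stealth`: `playwright-extra` implements the same `use()` plugin interface, so the Puppeteer-era stealth plugin registers unchanged, which is what the "re-add the stealth plugin" bullet amounts to. A sketch of the pattern:

    import { chromium } from "playwright-extra";
    import StealthPlugin from "puppeteer-extra-plugin-stealth";

    // playwright-extra mirrors puppeteer-extra's plugin API, so the
    // stealth evasions apply to every browser launched afterwards.
    chromium.use(StealthPlugin());

    const browser = await chromium.launch();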
@@ -287,39 +286,45 @@ async function crawlPage(
   if (!browser) {
     return browserlessCrawlPage(jobId, url, abortSignal);
   }
-  const context = await browser.createBrowserContext();
+  const context = await browser.newContext({
+    viewport: { width: 1440, height: 900 },
+    userAgent:
+      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
+  });
   try {
+    // Create a new page in the context
     const page = await context.newPage();
+    // Apply ad blocking
     if (globalBlocker) {
       await globalBlocker.enableBlockingInPage(page);
     }
-    await page.setUserAgent(
-      "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
-    );
+    // Navigate to the target URL
     logger.info(`[Crawler][${jobId}] Navigating to "${url}"`);
     const response = await page.goto(url, {
       timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
       waitUntil: "domcontentloaded",
     });
     logger.info(
       `[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
     );
-    // Wait until there's at most two connections for 2 seconds
-    // Attempt to wait only for 5 seconds
+    // Wait until network is relatively idle or timeout after 5 seconds
     await Promise.race([
-      page.waitForNetworkIdle({
-        idleTime: 1000, // 1 sec
-        concurrency: 2,
-      }),
-      new Promise((f) => setTimeout(f, 5000)),
+      page.waitForLoadState("networkidle", { timeout: 5000 }).catch(() => ({})),
+      new Promise((resolve) => setTimeout(resolve, 5000)),
     ]);
     logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
+    // Extract content from the page
     const htmlContent = await page.content();
     logger.info(`[Crawler][${jobId}] Successfully fetched the page content.`);
+    // Take a screenshot if configured
     let screenshot: Buffer | undefined = undefined;
     if (serverConfig.crawler.storeScreenshot) {
       try {
@@ -327,7 +332,6 @@
         page.screenshot({
           // If you change this, you need to change the asset type in the store function.
           type: "png",
-          encoding: "binary",
           fullPage: serverConfig.crawler.fullPageScreenshot,
         }),
         new Promise((_, reject) =>
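
The deleted `encoding: "binary"` line reflects an API difference rather than a behavior change: Puppeteer's screenshot could return base64 or binary depending on that option, while Playwright's `page.screenshot()` always resolves to a Buffer. A minimal sketch (assuming `page` is a Playwright `Page`):

    // No encoding option exists in Playwright; the result is always a Buffer.
    const screenshot: Buffer = await page.screenshot({
      type: "png",
      fullPage: true,
    });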
@@ -358,6 +362,7 @@
     };
   } finally {
     await context.close();
+    // Only close the browser if it was created on demand
     if (serverConfig.crawler.browserConnectOnDemand) {
       await browser.close();
     }
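
Three things changed shape in this function: the per-crawl `newContext()` is where the promised isolation comes from, the user agent moves into context creation because Playwright has no `page.setUserAgent()`, and the network-idle wait becomes a load state with its own timeout instead of Puppeteer's `waitForNetworkIdle`. A condensed sketch of the resulting pattern (hypothetical helper name, error handling omitted):

    import type { Browser } from "playwright";

    async function fetchPageHtml(browser: Browser, url: string): Promise<string> {
      // Fresh context per crawl: isolated cookies/storage. The UA must be set
      // here, since Playwright pages expose no setUserAgent().
      const context = await browser.newContext({
        viewport: { width: 1440, height: 900 },
        userAgent: "Mozilla/5.0 ...", // truncated for the sketch
      });
      try {
        const page = await context.newPage();
        await page.goto(url, { waitUntil: "domcontentloaded" });
        // Give the page up to 5s to settle; swallow the timeout because
        // "networkidle" may never fire on chatty pages.
        await page
          .waitForLoadState("networkidle", { timeout: 5000 })
          .catch(() => undefined);
        return await page.content();
      } finally {
        await context.close(); // closes the page and all per-crawl state
      }
    }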


@@ -13,7 +13,7 @@ RUN apk add --no-cache libc6-compat make g++ py3-pip linux-headers
 COPY . .
 ENV NEXT_TELEMETRY_DISABLED 1
-ENV PUPPETEER_SKIP_DOWNLOAD true
+ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
 RUN pnpm install --frozen-lockfile
 # Build the db migration script


@@ -11,4 +11,4 @@ RUN corepack enable
 COPY . .
 ENV NEXT_TELEMETRY_DISABLED 1
-ENV PUPPETEER_SKIP_DOWNLOAD true
+ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1


@@ -20,6 +20,8 @@ services:
   chrome:
     image: gcr.io/zenika-hub/alpine-chrome:123
     restart: unless-stopped
+    ports:
+      - 9222:9222
     command:
       - --no-sandbox
       - --disable-gpu
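
Publishing 9222 exposes the container's DevTools endpoint, which is exactly what the `connectOverCDP` branch in the worker dials. A hypothetical connection against this service (the real URL comes from the crawler's `browserWebUrl` config):

    import { chromium } from "playwright";

    // From the host, "localhost" works once the port is published;
    // inside the compose network the service name "chrome" resolves.
    const browser = await chromium.connectOverCDP("http://localhost:9222");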


@@ -223,7 +223,7 @@ install_karakeep() {
   mv karakeep-"$RELEASE" "$INSTALL_DIR" && cd "$APP_DIR"/web
   corepack enable
   export NEXT_TELEMETRY_DISABLED=1
-  export PUPPETEER_SKIP_DOWNLOAD="true"
+  export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true"
   export CI="true"
   $shh pnpm i --frozen-lockfile
   $shh pnpm build
@@ -433,7 +433,7 @@ update_karakeep() {
   fi
   corepack enable
   export NEXT_TELEMETRY_DISABLED=1
-  export PUPPETEER_SKIP_DOWNLOAD="true"
+  export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true"
   export CI="true"
   cd "$APP_DIR"/web && $shh pnpm i --frozen-lockfile
   $shh pnpm build

pnpm-lock.yaml (generated, 602 lines changed): diff suppressed because it is too large.


@@ -0,0 +1,5 @@
+{
+  "name": "oxlint",
+  "version": "0.0.0",
+  "private": true
+}