mirror of
https://github.com/karakeep-app/karakeep.git
synced 2026-01-09 07:44:58 +08:00
feat(workers): migrate from puppeteer to playwright (#1296)
* feat: convert to playwright

  Convert crawling to use Playwright instead of Puppeteer.
  - Update Dockerfile to include Playwright
  - Update crawler worker to use Playwright API
  - Update dependencies
* feat: convert from Puppeteer to Playwright for crawling
* feat: update docker-compose
* use separate browser context for better isolation
* skip chrome download in linux script
* re-add the stealth plugin

---------

Co-authored-by: Mohamed Bassem <me@mbassem.com>
This commit is contained in:
parent
727c7f2270
commit
c70d64d4cd
@ -4,7 +4,7 @@
|
||||
"version": "0.1.0",
|
||||
"private": true,
|
||||
"dependencies": {
|
||||
"@ghostery/adblocker-puppeteer": "^2.5.1",
|
||||
"@ghostery/adblocker-playwright": "^2.5.1",
|
||||
"@karakeep/db": "workspace:^0.1.0",
|
||||
"@karakeep/shared": "workspace:^0.1.0",
|
||||
"@karakeep/trpc": "workspace:^0.1.0",
|
||||
@ -36,8 +36,8 @@
|
||||
"pdf2json": "^3.1.5",
|
||||
"pdf2pic": "^3.1.3",
|
||||
"pdfjs-dist": "^4.2.67",
|
||||
"puppeteer": "^22.0.0",
|
||||
"puppeteer-extra": "^3.3.6",
|
||||
"playwright": "^1.42.1",
|
||||
"playwright-extra": "^4.3.6",
|
||||
"puppeteer-extra-plugin-stealth": "^2.11.2",
|
||||
"rss-parser": "^3.13.0",
|
||||
"tesseract.js": "^5.1.1",
|
||||
|
||||
@ -2,8 +2,7 @@ import * as dns from "dns";
|
||||
import { promises as fs } from "fs";
|
||||
import * as path from "node:path";
|
||||
import * as os from "os";
|
||||
import type { Browser } from "puppeteer";
|
||||
import { PuppeteerBlocker } from "@ghostery/adblocker-puppeteer";
|
||||
import { PlaywrightBlocker } from "@ghostery/adblocker-playwright";
|
||||
import { Readability } from "@mozilla/readability";
|
||||
import { Mutex } from "async-mutex";
|
||||
import DOMPurify from "dompurify";
|
||||
@ -25,7 +24,8 @@ import metascraperTitle from "metascraper-title";
|
||||
import metascraperTwitter from "metascraper-twitter";
|
||||
import metascraperUrl from "metascraper-url";
|
||||
import fetch from "node-fetch";
|
||||
import puppeteer from "puppeteer-extra";
|
||||
import { Browser } from "playwright";
|
||||
import { chromium } from "playwright-extra";
|
||||
import StealthPlugin from "puppeteer-extra-plugin-stealth";
|
||||
import { withTimeout } from "utils";
|
||||
import { getBookmarkDetails, updateAsset } from "workerUtils";
|
||||
@ -81,38 +81,37 @@ const metascraperParser = metascraper([
|
||||
]);
|
||||
|
||||
let globalBrowser: Browser | undefined;
|
||||
let globalBlocker: PuppeteerBlocker | undefined;
|
||||
let globalBlocker: PlaywrightBlocker | undefined;
|
||||
// Guards the interactions with the browser instance.
|
||||
// This is needed given that most of the browser APIs are async.
|
||||
const browserMutex = new Mutex();
|
||||
|
||||
async function startBrowserInstance() {
|
||||
const defaultViewport = {
|
||||
width: 1440,
|
||||
height: 900,
|
||||
};
|
||||
if (serverConfig.crawler.browserWebSocketUrl) {
|
||||
logger.info(
|
||||
`[Crawler] Connecting to existing browser websocket address: ${serverConfig.crawler.browserWebSocketUrl}`,
|
||||
);
|
||||
return puppeteer.connect({
|
||||
browserWSEndpoint: serverConfig.crawler.browserWebSocketUrl,
|
||||
defaultViewport,
|
||||
return await chromium.connect(serverConfig.crawler.browserWebSocketUrl, {
|
||||
// Important: using slowMo to ensure stability with remote browser
|
||||
slowMo: 100,
|
||||
timeout: 5000,
|
||||
});
|
||||
} else if (serverConfig.crawler.browserWebUrl) {
|
||||
logger.info(
|
||||
`[Crawler] Connecting to existing browser instance: ${serverConfig.crawler.browserWebUrl}`,
|
||||
);
|
||||
|
||||
const webUrl = new URL(serverConfig.crawler.browserWebUrl);
|
||||
// We need to resolve the ip address as a workaround for https://github.com/puppeteer/puppeteer/issues/2242
|
||||
const { address } = await dns.promises.lookup(webUrl.hostname);
|
||||
webUrl.hostname = address;
|
||||
logger.info(
|
||||
`[Crawler] Successfully resolved IP address, new address: ${webUrl.toString()}`,
|
||||
);
|
||||
return puppeteer.connect({
|
||||
browserURL: webUrl.toString(),
|
||||
defaultViewport,
|
||||
|
||||
return await chromium.connectOverCDP(webUrl.toString(), {
|
||||
// Important: using slowMo to ensure stability with remote browser
|
||||
slowMo: 100,
|
||||
timeout: 5000,
|
||||
});
|
||||
} else {
|
||||
logger.info(`Running in browserless mode`);
|
||||
@ -141,12 +140,12 @@ async function launchBrowser() {
|
||||
globalBrowser?.on("disconnected", () => {
|
||||
if (isShuttingDown) {
|
||||
logger.info(
|
||||
"[Crawler] The puppeteer browser got disconnected. But we're shutting down so won't restart it.",
|
||||
"[Crawler] The Playwright browser got disconnected. But we're shutting down so won't restart it.",
|
||||
);
|
||||
return;
|
||||
}
|
||||
logger.info(
|
||||
"[Crawler] The puppeteer browser got disconnected. Will attempt to launch it again.",
|
||||
"[Crawler] The Playwright browser got disconnected. Will attempt to launch it again.",
|
||||
);
|
||||
launchBrowser();
|
||||
});
|
||||
@ -155,11 +154,11 @@ async function launchBrowser() {
|
||||
|
||||
export class CrawlerWorker {
|
||||
static async build() {
|
||||
puppeteer.use(StealthPlugin());
|
||||
chromium.use(StealthPlugin());
|
||||
if (serverConfig.crawler.enableAdblocker) {
|
||||
try {
|
||||
logger.info("[crawler] Loading adblocker ...");
|
||||
globalBlocker = await PuppeteerBlocker.fromPrebuiltFull(fetch, {
|
||||
globalBlocker = await PlaywrightBlocker.fromPrebuiltFull(fetch, {
|
||||
path: path.join(os.tmpdir(), "karakeep_adblocker.bin"),
|
||||
read: fs.readFile,
|
||||
write: fs.writeFile,
|
||||
@ -287,39 +286,45 @@ async function crawlPage(
|
||||
if (!browser) {
|
||||
return browserlessCrawlPage(jobId, url, abortSignal);
|
||||
}
|
||||
const context = await browser.createBrowserContext();
|
||||
|
||||
const context = await browser.newContext({
|
||||
viewport: { width: 1440, height: 900 },
|
||||
userAgent:
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
});
|
||||
try {
|
||||
// Create a new page in the context
|
||||
const page = await context.newPage();
|
||||
|
||||
// Apply ad blocking
|
||||
if (globalBlocker) {
|
||||
await globalBlocker.enableBlockingInPage(page);
|
||||
}
|
||||
await page.setUserAgent(
|
||||
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
|
||||
);
|
||||
|
||||
// Navigate to the target URL
|
||||
logger.info(`[Crawler][${jobId}] Navigating to "${url}"`);
|
||||
const response = await page.goto(url, {
|
||||
timeout: serverConfig.crawler.navigateTimeoutSec * 1000,
|
||||
waitUntil: "domcontentloaded",
|
||||
});
|
||||
|
||||
logger.info(
|
||||
`[Crawler][${jobId}] Successfully navigated to "${url}". Waiting for the page to load ...`,
|
||||
);
|
||||
|
||||
// Wait until there's at most two connections for 2 seconds
|
||||
// Attempt to wait only for 5 seconds
|
||||
// Wait until network is relatively idle or timeout after 5 seconds
|
||||
await Promise.race([
|
||||
page.waitForNetworkIdle({
|
||||
idleTime: 1000, // 1 sec
|
||||
concurrency: 2,
|
||||
}),
|
||||
new Promise((f) => setTimeout(f, 5000)),
|
||||
page.waitForLoadState("networkidle", { timeout: 5000 }).catch(() => ({})),
|
||||
new Promise((resolve) => setTimeout(resolve, 5000)),
|
||||
]);
|
||||
|
||||
logger.info(`[Crawler][${jobId}] Finished waiting for the page to load.`);
|
||||
|
||||
// Extract content from the page
|
||||
const htmlContent = await page.content();
|
||||
logger.info(`[Crawler][${jobId}] Successfully fetched the page content.`);
|
||||
|
||||
// Take a screenshot if configured
|
||||
let screenshot: Buffer | undefined = undefined;
|
||||
if (serverConfig.crawler.storeScreenshot) {
|
||||
try {
|
||||
@ -327,7 +332,6 @@ async function crawlPage(
|
||||
page.screenshot({
|
||||
// If you change this, you need to change the asset type in the store function.
|
||||
type: "png",
|
||||
encoding: "binary",
|
||||
fullPage: serverConfig.crawler.fullPageScreenshot,
|
||||
}),
|
||||
new Promise((_, reject) =>
|
||||
@ -358,6 +362,7 @@ async function crawlPage(
|
||||
};
|
||||
} finally {
|
||||
await context.close();
|
||||
// Only close the browser if it was created on demand
|
||||
if (serverConfig.crawler.browserConnectOnDemand) {
|
||||
await browser.close();
|
||||
}
|
||||
|
||||
@ -13,7 +13,7 @@ RUN apk add --no-cache libc6-compat make g++ py3-pip linux-headers
|
||||
|
||||
COPY . .
|
||||
ENV NEXT_TELEMETRY_DISABLED 1
|
||||
ENV PUPPETEER_SKIP_DOWNLOAD true
|
||||
ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
|
||||
RUN pnpm install --frozen-lockfile
|
||||
|
||||
# Build the db migration script
|
||||
|
||||
@ -11,4 +11,4 @@ RUN corepack enable
|
||||
|
||||
COPY . .
|
||||
ENV NEXT_TELEMETRY_DISABLED 1
|
||||
ENV PUPPETEER_SKIP_DOWNLOAD true
|
||||
ENV PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=1
|
||||
|
||||
@ -20,6 +20,8 @@ services:
|
||||
chrome:
|
||||
image: gcr.io/zenika-hub/alpine-chrome:123
|
||||
restart: unless-stopped
|
||||
ports:
|
||||
- 9222:9222
|
||||
command:
|
||||
- --no-sandbox
|
||||
- --disable-gpu
|
||||
|
||||
@ -223,7 +223,7 @@ install_karakeep() {
|
||||
mv karakeep-"$RELEASE" "$INSTALL_DIR" && cd "$APP_DIR"/web
|
||||
corepack enable
|
||||
export NEXT_TELEMETRY_DISABLED=1
|
||||
export PUPPETEER_SKIP_DOWNLOAD="true"
|
||||
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true"
|
||||
export CI="true"
|
||||
$shh pnpm i --frozen-lockfile
|
||||
$shh pnpm build
|
||||
@ -433,7 +433,7 @@ update_karakeep() {
|
||||
fi
|
||||
corepack enable
|
||||
export NEXT_TELEMETRY_DISABLED=1
|
||||
export PUPPETEER_SKIP_DOWNLOAD="true"
|
||||
export PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD="true"
|
||||
export CI="true"
|
||||
cd "$APP_DIR"/web && $shh pnpm i --frozen-lockfile
|
||||
$shh pnpm build
|
||||
|
||||
602
pnpm-lock.yaml
generated
602
pnpm-lock.yaml
generated
File diff suppressed because it is too large
Load Diff
5
tooling/oxlint/package.json
Normal file
5
tooling/oxlint/package.json
Normal file
@ -0,0 +1,5 @@
|
||||
{
|
||||
"name": "oxlint",
|
||||
"version": "0.0.0",
|
||||
"private": true
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user