From 0806975e19cbf43f033a162f440c1dcb5518543b Mon Sep 17 00:00:00 2001
From: Shivan Kaul Sahib
Date: Tue, 8 Aug 2023 15:13:33 -0700
Subject: [PATCH 1/3] Add recursive parameter to allow crawling recursively

Crawl up to a depth of `r` by choosing a link at random from each page.
---
 src/brave/crawl.ts    | 47 +++++++++++++++++++++++++++++++++++++++++--
 src/brave/validate.ts |  2 ++
 src/declarations.d.ts |  3 ++-
 src/run.ts            |  4 ++++
 4 files changed, 53 insertions(+), 3 deletions(-)

diff --git a/src/brave/crawl.ts b/src/brave/crawl.ts
index 34c9587..3e77a00 100644
--- a/src/brave/crawl.ts
+++ b/src/brave/crawl.ts
@@ -44,6 +44,8 @@ const setupEnv = (args: CrawlArgs): EnvHandle => {
 export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
   const logger = getLogger(args)
   const url: Url = args.urls[0]
+  const depth = args.recursiveDepth || 1
+  let randomChildUrl: Url = null

   const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args)

@@ -77,14 +79,16 @@
     logger.debug(`generatePageGraph { size: ${response.data.length} }`)
     const outputFilename = pathLib.join(
       args.outputPath,
-      `page_graph_${url.replace(/[^\w]/g, '_')}_${Math.floor(
+      `page_graph_${url?.replace(/[^\w]/g, '_')}_${Math.floor(
         Date.now() / 1000
       )}.graphml`
     )
     fsExtraLib.writeFile(outputFilename, response.data).catch((err: Error) => {
       logger.debug('ERROR saving Page.generatePageGraph output:', err)
     })
-
+    if (depth > 1) {
+      randomChildUrl = await getRandomLinkFromPage(page, logger)
+    }
     logger.debug('Closing page')
     await page.close()
   } catch (err) {
@@ -102,4 +106,43 @@
       fsExtraLib.remove(pathForProfile)
     }
   }
+  if (randomChildUrl) {
+    const newArgs = { ...args }
+    newArgs.urls = [randomChildUrl]
+    newArgs.recursiveDepth = depth - 1
+    await writeGraphsForCrawl(newArgs)
+  }
+}
+
+const getRandomLinkFromPage = async (page: any /* puppeteer Page */, logger: Logger): Promise<Url> => {
+  let rawLinks
+  try {
+    rawLinks = await page.$$('a[href]')
+  } catch (e) {
+    logger.debug(`Unable to look for child links, page closed: ${e.toString()}`)
+    return null
+  }
+  const links = []
+  for (const link of rawLinks) {
+    const hrefHandle = await link.getProperty('href')
+    const hrefValue = await hrefHandle.jsonValue()
+    try {
+      const hrefUrl = new URL(hrefValue.trim())
+      hrefUrl.hash = ''
+      hrefUrl.search = ''
+      if (hrefUrl.protocol !== 'http:' && hrefUrl.protocol !== 'https:') {
+        continue
+      }
+      const childUrlString = hrefUrl.toString()
+      if (!childUrlString || childUrlString.length === 0) {
+        continue
+      }
+      links.push(childUrlString)
+    } catch (_) {
+      continue
+    }
+  }
+  // https://stackoverflow.com/a/4550514
+  const randomLink = links[Math.floor(Math.random() * links.length)]
+  return randomLink
+}
 }
diff --git a/src/brave/validate.ts b/src/brave/validate.ts
index 482f79d..682d350 100644
--- a/src/brave/validate.ts
+++ b/src/brave/validate.ts
@@ -54,6 +54,7 @@ export const validate = (rawArgs: any): ValidationResult => {
   }
   const urls: Url[] = passedUrlArgs
   const secs: number = rawArgs.secs
+  const recursiveDepth: number = rawArgs.recursive_depth
   const interactive: boolean = rawArgs.interactive
   const userAgent: string | undefined = rawArgs.user_agent

@@ -61,6 +62,7 @@
     executablePath,
     outputPath,
     urls,
+    recursiveDepth,
     seconds: secs,
     withShieldsUp: (rawArgs.shields === 'up'),
     debugLevel: rawArgs.debug,
diff --git a/src/declarations.d.ts b/src/declarations.d.ts
index a17718e..1ff5d0b 100644
--- a/src/declarations.d.ts
+++ b/src/declarations.d.ts
@@ -5,7 +5,7 @@ declare module 'puppeteer-core'
 declare module 'tmp'
 declare module 'xvfb'

-type Url = string
+type Url = string | null
 type FilePath = string
 type ErrorMsg = string
 type DebugLevel = 'none' | 'debug' | 'verbose'
@@ -14,6 +14,7 @@ interface CrawlArgs {
   executablePath: FilePath,
   outputPath: FilePath,
   urls: Url[],
+  recursiveDepth: number,
   withShieldsUp: boolean,
   debugLevel: DebugLevel,
   seconds: number,
diff --git a/src/run.ts b/src/run.ts
index d943024..dd55c58 100755
--- a/src/run.ts
+++ b/src/run.ts
@@ -18,6 +18,10 @@ parser.addArgument(['-b', '--binary'], {
   required: true,
   help: 'Path to the PageGraph enabled build of Brave.'
 })
+parser.addArgument(['-r', '--recursive-depth'], {
+  defaultValue: 1,
+  help: 'If provided and > 1, choose a link at random on page and do another crawl to this depth.'
+})
 parser.addArgument(['-o', '--output'], {
   help: 'Path (directory) to write graphs to.',
   required: true
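The control flow patch 1 adds is easier to see outside the diff. Below is a minimal, self-contained sketch of the depth-limited random walk; `fetchChildLinks` is an illustrative stand-in (not project code) for the Puppeteer-backed `getRandomLinkFromPage`:

```typescript
// Sketch only: models the recursion shape introduced in patch 1.
type Url = string | null

// Placeholder for getRandomLinkFromPage: the real code collects `a[href]`
// elements, keeps only http(s) links, and strips fragments and query strings.
const fetchChildLinks = async (url: Url): Promise<string[]> => {
  return []
}

const crawlRandomWalk = async (url: Url, depth: number): Promise<void> => {
  console.log(`crawling ${url} (remaining depth: ${depth})`)
  if (depth <= 1) {
    return // depth 1 means: crawl this page only, no recursion
  }
  const links = await fetchChildLinks(url)
  if (links.length === 0) {
    return // dead end: no eligible child links on this page
  }
  const child = links[Math.floor(Math.random() * links.length)]
  await crawlRandomWalk(child, depth - 1)
}

// With a depth of 3, at most three pages are visited: the seed plus two hops.
crawlRandomWalk('https://example.com', 3).catch(console.error)
```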
From 8f470f888a976cf32779508a08568e6f3910e615 Mon Sep 17 00:00:00 2001
From: Shivan Kaul Sahib
Date: Tue, 8 Aug 2023 15:19:57 -0700
Subject: [PATCH 2/3] Change name of method, clarify help text

---
 src/brave/crawl.ts | 4 ++--
 src/run.ts         | 6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/src/brave/crawl.ts b/src/brave/crawl.ts
index 3e77a00..a600057 100644
--- a/src/brave/crawl.ts
+++ b/src/brave/crawl.ts
@@ -41,7 +41,7 @@ const setupEnv = (args: CrawlArgs): EnvHandle => {
   }
 }

-export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
+export const doCrawl = async (args: CrawlArgs): Promise<void> => {
   const logger = getLogger(args)
   const url: Url = args.urls[0]
   const depth = args.recursiveDepth || 1
@@ -110,7 +110,7 @@
     const newArgs = { ...args }
     newArgs.urls = [randomChildUrl]
     newArgs.recursiveDepth = depth - 1
-    await writeGraphsForCrawl(newArgs)
+    await doCrawl(newArgs)
   }
 }

diff --git a/src/run.ts b/src/run.ts
index dd55c58..7760564 100755
--- a/src/run.ts
+++ b/src/run.ts
@@ -2,7 +2,7 @@

 import argparseLib from 'argparse'

-import { writeGraphsForCrawl } from './brave/crawl.js'
+import { doCrawl } from './brave/crawl.js'
 import { validate } from './brave/validate.js'

 const defaultCrawlSecs = 30
@@ -20,7 +20,7 @@ parser.addArgument(['-b', '--binary'], {
 })
 parser.addArgument(['-r', '--recursive-depth'], {
   defaultValue: 1,
-  help: 'If provided and > 1, choose a link at random on page and do another crawl to this depth.'
+  help: 'If provided, choose a link at random on page and do another crawl to this depth. Default: 1 (no recursion).'
 })
 parser.addArgument(['-o', '--output'], {
   help: 'Path (directory) to write graphs to.',
   required: true
@@ -81,4 +81,4 @@ if (!isValid) {
 }

 const crawlArgs = errorOrArgs as CrawlArgs
-writeGraphsForCrawl(crawlArgs)
+doCrawl(crawlArgs)
From d6c33dc2fe4760a62c62bd71cd4daaa16172fbef Mon Sep 17 00:00:00 2001
From: Shivan Kaul Sahib
Date: Tue, 8 Aug 2023 15:21:37 -0700
Subject: [PATCH 3/3] Update README

---
 README.md | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 5edc9ad..a3acd8d 100644
--- a/README.md
+++ b/README.md
@@ -9,9 +9,15 @@ Usage
 ---

 ```
-usage: run.js [-h] [-v] -b BINARY -o OUTPUT -u URL [URL ...]
-              [-e EXISTING_PROFILE] [-p PERSIST_PROFILE] [-s {up,down}]
-              [-t SECS] [--debug {none,debug,verbose}]
+$ npm run crawl -- -h
+
+> pagegraph-crawl@1.0.0 crawl
+> node ./built/run.js
+
+usage: run.js [-h] [-v] -b BINARY [-r RECURSIVE_DEPTH] -o OUTPUT -u URL
+              [URL ...] [-e EXISTING_PROFILE] [-p PERSIST_PROFILE]
+              [-s {up,down}] [-t SECS] [--debug {none,debug,verbose}] [-i]
+              [-a USER_AGENT] [--proxy-server URL] [-x JSON_ARRAY]

 CLI tool for crawling and recording websites with PageGraph

@@ -20,9 +26,13 @@ Optional arguments:
   -h, --help            Show this help message and exit.
   -v, --version         Show program's version number and exit.
   -b BINARY, --binary BINARY
-                        Path to the PageGraph-enabled build of Brave.
+                        Path to the PageGraph enabled build of Brave.
+  -r RECURSIVE_DEPTH, --recursive-depth RECURSIVE_DEPTH
+                        If provided, choose a link at random on page and do
+                        another crawl to this depth. Default: 1 (no
+                        recursion).
   -o OUTPUT, --output OUTPUT
-                        Path to write graphs to.
+                        Path (directory) to write graphs to.
   -u URL [URL ...], --url URL [URL ...]
                         The URLs(s) to record, in desired order (currently
                         only crawls the first URL)
@@ -38,4 +48,12 @@
   -t SECS, --secs SECS  The dwell time in seconds. Defaults: 30 sec.
   --debug {none,debug,verbose}
                         Print debugging information. Default: none.
+  -i, --interactive     Suppress use of Xvfb to allow interaction with
+                        spawned browser instance
+  -a USER_AGENT, --user-agent USER_AGENT
+                        Override the browser's UserAgent string to USER_AGENT
+  --proxy-server URL    Use an HTTP/SOCKS proxy at URL for all navigations
+  -x JSON_ARRAY, --extra-args JSON_ARRAY
+                        Pass JSON_ARRAY as extra CLI argument to the browser
+                        instance launched
 ```
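A hypothetical end-to-end invocation of the new option, in the same style as the README's usage block (the binary path and output directory are placeholders, not values from the patch):

```
$ npm run crawl -- -b /path/to/brave -o ./graphs -u https://example.com -r 3
```

With `-r 3`, the crawler records the seed page, then follows one randomly chosen link from each visited page for two further crawls, writing a separate `.graphml` file for each page.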