Add recursive parameter to allow crawling recursively #26

Merged · 3 commits · Aug 8, 2023

28 changes: 23 additions & 5 deletions README.md
@@ -9,9 +9,15 @@ Usage
 ---
 
 ```
-usage: run.js [-h] [-v] -b BINARY -o OUTPUT -u URL [URL ...]
-              [-e EXISTING_PROFILE] [-p PERSIST_PROFILE] [-s {up,down}]
-              [-t SECS] [--debug {none,debug,verbose}]
+$ npm run crawl -- -h
+
+> [email protected] crawl
+> node ./built/run.js
+
+usage: run.js [-h] [-v] -b BINARY [-r RECURSIVE_DEPTH] -o OUTPUT -u URL
+              [URL ...] [-e EXISTING_PROFILE] [-p PERSIST_PROFILE]
+              [-s {up,down}] [-t SECS] [--debug {none,debug,verbose}] [-i]
+              [-a USER_AGENT] [--proxy-server URL] [-x JSON_ARRAY]
 
 
 CLI tool for crawling and recording websites with PageGraph
@@ -20,9 +26,13 @@ Optional arguments:
   -h, --help            Show this help message and exit.
   -v, --version         Show program's version number and exit.
   -b BINARY, --binary BINARY
-                        Path to the PageGraph-enabled build of Brave.
+                        Path to the PageGraph enabled build of Brave.
+  -r RECURSIVE_DEPTH, --recursive-depth RECURSIVE_DEPTH
+                        If provided, choose a link at random on page and do
+                        another crawl to this depth. Default: 1 (no
+                        recursion).
   -o OUTPUT, --output OUTPUT
-                        Path to write graphs to.
+                        Path (directory) to write graphs to.
   -u URL [URL ...], --url URL [URL ...]
                         The URL(s) to record, in desired order (currently
                         only crawls the first URL)
@@ -38,4 +48,12 @@ Optional arguments:
   -t SECS, --secs SECS  The dwell time in seconds. Default: 30 sec.
   --debug {none,debug,verbose}
                         Print debugging information. Default: none.
+  -i, --interactive     Suppress use of Xvfb to allow interaction with
+                        spawned browser instance
+  -a USER_AGENT, --user-agent USER_AGENT
+                        Override the browser's UserAgent string to USER_AGENT
+  --proxy-server URL    Use an HTTP/SOCKS proxy at URL for all navigations
+  -x JSON_ARRAY, --extra-args JSON_ARRAY
+                        Pass JSON_ARRAY as extra CLI argument to the browser
+                        instance launched
 ```
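With the new flag, a depth-3 crawl (the binary and output paths below are hypothetical) could be launched as:

```
npm run crawl -- \
  -b /path/to/pagegraph-brave \
  -o ./graphs \
  -u https://example.com \
  -r 3
```

Each level writes its own `.graphml` file to the output directory before following one randomly chosen link.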
49 changes: 46 additions & 3 deletions src/brave/crawl.ts
@@ -41,9 +41,11 @@ const setupEnv = (args: CrawlArgs): EnvHandle => {
   }
 }
 
-export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
+export const doCrawl = async (args: CrawlArgs): Promise<void> => {
   const logger = getLogger(args)
   const url: Url = args.urls[0]
+  const depth = args.recursiveDepth || 1
+  let randomChildUrl: Url = null
 
   const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args)
 
@@ -77,14 +79,16 @@ export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
     logger.debug(`generatePageGraph { size: ${response.data.length} }`)
     const outputFilename = pathLib.join(
       args.outputPath,
-      `page_graph_${url.replace(/[^\w]/g, '_')}_${Math.floor(
+      `page_graph_${url?.replace(/[^\w]/g, '_')}_${Math.floor(
         Date.now() / 1000
       )}.graphml`
     )
     fsExtraLib.writeFile(outputFilename, response.data).catch((err: Error) => {
       logger.debug('ERROR saving Page.generatePageGraph output:', err)
     })
-
+    if (depth > 1) {
+      randomChildUrl = await getRandomLinkFromPage(page, logger)
+    }
     logger.debug('Closing page')
     await page.close()
   } catch (err) {
@@ -102,4 +106,43 @@ export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
       fsExtraLib.remove(pathForProfile)
     }
   }
+  if (randomChildUrl) {
+    const newArgs = { ...args }
+    newArgs.urls = [randomChildUrl]
+    newArgs.recursiveDepth = depth - 1
+    await doCrawl(newArgs)
+  }
 }
+
+const getRandomLinkFromPage = async (page: any /* puppeteer Page */, logger: Logger): Promise<Url> => {
+  let rawLinks
+  try {
+    rawLinks = await page.$$('a[href]')
+  } catch (e) {
+    logger.debug(`Unable to look for child links, page closed: ${e.toString()}`)
+    return null
+  }
+  const links = []
+  for (const link of rawLinks) {
+    const hrefHandle = await link.getProperty('href')
+    const hrefValue = await hrefHandle.jsonValue()
+    try {
+      const hrefUrl = new URL(hrefValue.trim())
+      hrefUrl.hash = ''
+      hrefUrl.search = ''
+      if (hrefUrl.protocol !== 'http:' && hrefUrl.protocol !== 'https:') {
+        continue
+      }
+      const childUrlString = hrefUrl.toString()
+      if (!childUrlString || childUrlString.length === 0) {
+        continue
+      }
+      links.push(childUrlString)
+    } catch (_) {
+      continue
+    }
+  }
+  // https://stackoverflow.com/a/4550514
+  const randomLink = links[Math.floor(Math.random() * links.length)]
+  return randomLink
+}
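The recursion bottoms out through the depth argument: each call handles one page, and when `depth > 1` it re-invokes `doCrawl` with a cloned args object whose depth is decremented. A minimal sketch of the pattern (names illustrative, browser work elided):

```typescript
// Depth-bounded random-walk recursion, mirroring doCrawl above.
// pickChild stands in for getRandomLinkFromPage; page visiting is elided.
type Step = { url: string, recursiveDepth: number }

const crawlChain = async (
  step: Step,
  pickChild: (url: string) => Promise<string | null>
): Promise<void> => {
  // ... visit step.url and write its graph here ...
  if (step.recursiveDepth <= 1) {
    return // depth 1 means "just this page" (the CLI default)
  }
  const child = await pickChild(step.url)
  if (child !== null) {
    // Clone the arguments, swap in the child URL, decrement the depth.
    await crawlChain({ url: child, recursiveDepth: step.recursiveDepth - 1 }, pickChild)
  }
}
```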
2 changes: 2 additions & 0 deletions src/brave/validate.ts
@@ -54,13 +54,15 @@ export const validate = (rawArgs: any): ValidationResult => {
   }
   const urls: Url[] = passedUrlArgs
   const secs: number = rawArgs.secs
+  const recursiveDepth: number = rawArgs.recursive_depth
   const interactive: boolean = rawArgs.interactive
   const userAgent: string | undefined = rawArgs.user_agent
 
   const validatedArgs: CrawlArgs = {
     executablePath,
     outputPath,
     urls,
+    recursiveDepth,
     seconds: secs,
     withShieldsUp: (rawArgs.shields === 'up'),
     debugLevel: rawArgs.debug,
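One caveat (an observation, not part of this diff): with the `argparse` package, values supplied on the command line typically arrive as strings, while `defaultValue: 1` stays a number, so `recursiveDepth` may hold `'3'` rather than `3`. A defensive coercion might look like:

```typescript
// Hypothetical helper, not in this PR: normalize recursive_depth to a
// positive integer, falling back to 1 ("no recursion") on bad input.
const parseDepth = (raw: unknown): number => {
  const n = typeof raw === 'string' ? parseInt(raw, 10) : Number(raw)
  return Number.isInteger(n) && n >= 1 ? n : 1
}
```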
3 changes: 2 additions & 1 deletion src/declarations.d.ts
@@ -5,7 +5,7 @@ declare module 'puppeteer-core'
 declare module 'tmp'
 declare module 'xvfb'
 
-type Url = string
+type Url = string | null
 type FilePath = string
 type ErrorMsg = string
 type DebugLevel = 'none' | 'debug' | 'verbose'
@@ -14,6 +14,7 @@ interface CrawlArgs {
   executablePath: FilePath,
   outputPath: FilePath,
   urls: Url[],
+  recursiveDepth: number,
   withShieldsUp: boolean,
   debugLevel: DebugLevel,
   seconds: number,
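Widening `Url` to `string | null` lets `getRandomLinkFromPage` signal "no usable link" through its return type, but it also means consumers of a `Url` should narrow before doing string work (the `url?.replace` change in crawl.ts is one example). A small illustrative guard:

```typescript
// Illustrative only: narrow a possibly-null Url before using string methods.
type Url = string | null

const slugFor = (url: Url): string | null =>
  url === null ? null : url.replace(/[^\w]/g, '_')
```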
8 changes: 6 additions & 2 deletions src/run.ts
@@ -2,7 +2,7 @@
 
 import argparseLib from 'argparse'
 
-import { writeGraphsForCrawl } from './brave/crawl.js'
+import { doCrawl } from './brave/crawl.js'
 import { validate } from './brave/validate.js'
 
 const defaultCrawlSecs = 30
@@ -18,6 +18,10 @@ parser.addArgument(['-b', '--binary'], {
   required: true,
   help: 'Path to the PageGraph enabled build of Brave.'
 })
+parser.addArgument(['-r', '--recursive-depth'], {
+  defaultValue: 1,
+  help: 'If provided, choose a link at random on page and do another crawl to this depth. Default: 1 (no recursion).'
+})
 parser.addArgument(['-o', '--output'], {
   help: 'Path (directory) to write graphs to.',
   required: true
@@ -77,4 +81,4 @@ if (!isValid) {
 }
 
 const crawlArgs = errorOrArgs as CrawlArgs
-writeGraphsForCrawl(crawlArgs)
+doCrawl(crawlArgs)
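Since `doCrawl` is async, this top-level call leaves any rejection unhandled. A possible hardening (not part of this PR) would surface failures as a non-zero exit:

```typescript
// Hypothetical hardening, not in this PR: report crawl failures and
// exit non-zero instead of leaving an unhandled promise rejection.
doCrawl(crawlArgs).catch((err: Error) => {
  console.error('Crawl failed:', err)
  process.exit(1)
})
```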