Add recursive parameter to allow crawling recursively (#26)
Crawl up to a depth of `r` by choosing a link at random from the page
Update README
ShivanKaul authored Aug 8, 2023
1 parent b29b8d6 commit a917a11
Showing 5 changed files with 79 additions and 11 deletions.
28 changes: 23 additions & 5 deletions README.md
@@ -9,9 +9,15 @@ Usage
---

```
usage: run.js [-h] [-v] -b BINARY -o OUTPUT -u URL [URL ...]
[-e EXISTING_PROFILE] [-p PERSIST_PROFILE] [-s {up,down}]
[-t SECS] [--debug {none,debug,verbose}]
$ npm run crawl -- -h
> [email protected] crawl
> node ./built/run.js
usage: run.js [-h] [-v] -b BINARY [-r RECURSIVE_DEPTH] -o OUTPUT -u URL
[URL ...] [-e EXISTING_PROFILE] [-p PERSIST_PROFILE]
[-s {up,down}] [-t SECS] [--debug {none,debug,verbose}] [-i]
[-a USER_AGENT] [--proxy-server URL] [-x JSON_ARRAY]
CLI tool for crawling and recording websites with PageGraph
@@ -20,9 +26,13 @@ Optional arguments:
-h, --help Show this help message and exit.
-v, --version Show program's version number and exit.
-b BINARY, --binary BINARY
Path to the PageGraph-enabled build of Brave.
Path to the PageGraph enabled build of Brave.
-r RECURSIVE_DEPTH, --recursive-depth RECURSIVE_DEPTH
If provided, choose a link at random on the page and
do another crawl to this depth. Default: 1 (no
recursion).
-o OUTPUT, --output OUTPUT
Path to write graphs to.
Path (directory) to write graphs to.
-u URL [URL ...], --url URL [URL ...]
The URL(s) to record, in desired order (currently
only crawls the first URL)
@@ -38,4 +48,12 @@ Optional arguments:
-t SECS, --secs SECS The dwell time in seconds. Default: 30 sec.
--debug {none,debug,verbose}
Print debugging information. Default: none.
-i, --interactive Suppress use of Xvfb to allow interaction with
spawned browser instance
-a USER_AGENT, --user-agent USER_AGENT
Override the browser's UserAgent string to USER_AGENT
--proxy-server URL Use an HTTP/SOCKS proxy at URL for all navigations
-x JSON_ARRAY, --extra-args JSON_ARRAY
Pass JSON_ARRAY as extra CLI argument to the browser
instance launched
```
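
For example, a hypothetical invocation using the new flag (the binary and output paths below are placeholders, not values from this repository):

```
npm run crawl -- \
  --binary /path/to/pagegraph-enabled-brave \
  --output ./graphs \
  --url https://example.com \
  --recursive-depth 3
```

With `--recursive-depth 3`, the crawler records the seed page, then crawls one randomly chosen link from it, then one more from that page, writing a separate `.graphml` file for each page visited.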
49 changes: 46 additions & 3 deletions src/brave/crawl.ts
@@ -41,9 +41,11 @@ const setupEnv = (args: CrawlArgs): EnvHandle => {
}
}

export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
export const doCrawl = async (args: CrawlArgs): Promise<void> => {
const logger = getLogger(args)
const url: Url = args.urls[0]
const depth = args.recursiveDepth || 1
let randomChildUrl: Url = null

const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args)

@@ -77,14 +79,16 @@ export const writeGraphsForCrawl = async (args: CrawlArgs): Promise<void> => {
logger.debug(`generatePageGraph { size: ${response.data.length} }`)
const outputFilename = pathLib.join(
args.outputPath,
`page_graph_${url.replace(/[^\w]/g, '_')}_${Math.floor(
`page_graph_${url?.replace(/[^\w]/g, '_')}_${Math.floor(
Date.now() / 1000
)}.graphml`
)
fsExtraLib.writeFile(outputFilename, response.data).catch((err: Error) => {
logger.debug('ERROR saving Page.generatePageGraph output:', err)
})

if (depth > 1) {
randomChildUrl = await getRandomLinkFromPage(page, logger)
}
logger.debug('Closing page')
await page.close()
} catch (err) {
@@ -102,4 +106,43 @@
fsExtraLib.remove(pathForProfile)
}
}
if (randomChildUrl) {
const newArgs = { ...args }
newArgs.urls = [randomChildUrl]
newArgs.recursiveDepth = depth - 1
await doCrawl(newArgs)
}
}

const getRandomLinkFromPage = async (page: any, logger: Logger) : Promise<Url> /* puppeteer Page */ => {
let rawLinks
try {
rawLinks = await page.$$('a[href]')
} catch (e) {
logger.debug(`Unable to look for child links, page closed: ${e.toString()}`)
return null
}
const links = []
for (const link of rawLinks) {
const hrefHandle = await link.getProperty('href')
const hrefValue = await hrefHandle.jsonValue()
try {
const hrefUrl = new URL(hrefValue.trim())
hrefUrl.hash = ''
hrefUrl.search = ''
if (hrefUrl.protocol !== 'http:' && hrefUrl.protocol !== 'https:') {
continue
}
const childUrlString = hrefUrl.toString()
if (!childUrlString || childUrlString.length === 0) {
continue
}
links.push(childUrlString)
} catch (_) {
continue
}
}
// https://stackoverflow.com/a/4550514
const randomLink = links[Math.floor(Math.random() * links.length)]
return randomLink
}
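
The recursion in `doCrawl` bottoms out once the depth reaches 1: each level records one page, optionally asks `getRandomLinkFromPage` for a child URL, and calls itself again with `recursiveDepth` decremented. When a page has no usable links, the random pick is `undefined`, the `if (randomChildUrl)` guard fails, and the crawl stops early. A minimal standalone sketch of that control flow, using hypothetical names that are not part of this commit:

```
// Illustration only: how the recursion unwinds for a given starting depth.
// `pickChild` stands in for getRandomLinkFromPage and may return null.
type VisitedPage = { url: string, depth: number }

const simulateCrawl = (
  url: string,
  depth: number,
  pickChild: (u: string) => string | null
): VisitedPage[] => {
  const visited: VisitedPage[] = [{ url, depth }]
  if (depth > 1) {
    const child = pickChild(url)
    if (child !== null) {
      visited.push(...simulateCrawl(child, depth - 1, pickChild))
    }
  }
  return visited
}

// A depth of 3 visits at most three pages: the seed plus two random descendants.
console.log(simulateCrawl('https://example.com', 3, u => `${u}/next`))
```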
2 changes: 2 additions & 0 deletions src/brave/validate.ts
@@ -54,13 +54,15 @@ export const validate = (rawArgs: any): ValidationResult => {
}
const urls: Url[] = passedUrlArgs
const secs: number = rawArgs.secs
const recursiveDepth: number = rawArgs.recursive_depth
const interactive: boolean = rawArgs.interactive
const userAgent: string | undefined = rawArgs.user_agent

const validatedArgs: CrawlArgs = {
executablePath,
outputPath,
urls,
recursiveDepth,
seconds: secs,
withShieldsUp: (rawArgs.shields === 'up'),
debugLevel: rawArgs.debug,
3 changes: 2 additions & 1 deletion src/declarations.d.ts
@@ -5,7 +5,7 @@ declare module 'puppeteer-core'
declare module 'tmp'
declare module 'xvfb'

type Url = string
type Url = string | null
type FilePath = string
type ErrorMsg = string
type DebugLevel = 'none' | 'debug' | 'verbose'
@@ -14,6 +14,7 @@ interface CrawlArgs {
executablePath: FilePath,
outputPath: FilePath,
urls: Url[],
recursiveDepth: number,
withShieldsUp: boolean,
debugLevel: DebugLevel,
seconds: number,
8 changes: 6 additions & 2 deletions src/run.ts
@@ -2,7 +2,7 @@

import argparseLib from 'argparse'

import { writeGraphsForCrawl } from './brave/crawl.js'
import { doCrawl } from './brave/crawl.js'
import { validate } from './brave/validate.js'

const defaultCrawlSecs = 30
@@ -18,6 +18,10 @@ parser.addArgument(['-b', '--binary'], {
required: true,
help: 'Path to the PageGraph enabled build of Brave.'
})
parser.addArgument(['-r', '--recursive-depth'], {
defaultValue: 1,
help: 'If provided, choose a link at random on the page and do another crawl to this depth. Default: 1 (no recursion).'
})
parser.addArgument(['-o', '--output'], {
help: 'Path (directory) to write graphs to.',
required: true
@@ -77,4 +81,4 @@ if (!isValid) {
}

const crawlArgs = errorOrArgs as CrawlArgs
writeGraphsForCrawl(crawlArgs)
doCrawl(crawlArgs)
