Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add tests and print out graphs on redirects #27

Merged
merged 5 commits into from
Aug 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,19 @@ Command line tool for crawling with PageGraph. It does not include a
PageGraph enabled build though; you can point it at the latest Nightly
version of Brave.

Install
---
```bash
npm install
```

Test
---
```bash
npm run test
```
The tests are defined in `test/test.js`. Test parameters are defined in `test/config.js` and can be overridden via environment variables. You need to specify the path to a PageGraph-enabled browser binary.

Usage
---

Expand Down
5,240 changes: 4,215 additions & 1,025 deletions package-lock.json

Large diffs are not rendered by default.

6 changes: 5 additions & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
"lint:fix": "standardx --fix src/**/*.ts src/*.ts",
"build": "tsc",
"postinstall": "npm run build",
"crawl": "node ./built/run.js"
"crawl": "node ./built/run.js",
"test": "npm run build && mocha test/test.js --timeout 60000"
},
"type": "module",
"author": "",
Expand All @@ -26,6 +27,9 @@
"devDependencies": {
"@typescript-eslint/eslint-plugin": "^3.3.0",
"@typescript-eslint/parser": "^3.3.0",
"chai": "^4.3.6",
"express": "^4.18.2",
"mocha": "^10.0.0",
"standardx": "^7.0.0"
},
"eslintConfig": {
Expand Down
72 changes: 53 additions & 19 deletions src/brave/crawl.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,36 @@ const setupEnv = (args: CrawlArgs): EnvHandle => {
}
}

/**
 * Lets the page run for the requested time, then requests its PageGraph.
 *
 * @param seconds - Settle time before the capture, in seconds.
 * @param page - Page object; only its `waitFor` method is used here.
 * @param client - Object whose `send` method issues `Page.generatePageGraph`
 *   (presumably a DevTools protocol session — confirm against the caller).
 * @param logger - Receives debug messages for the wait and the capture size.
 * @returns The unmodified response of `Page.generatePageGraph`.
 */
async function generatePageGraph (seconds: number, page: any, client: any, logger: Logger) {
  const delayMs = seconds * 1000
  logger.debug(`Waiting for ${delayMs}ms`)
  await page.waitFor(delayMs)

  logger.debug('calling generatePageGraph')
  const graphResponse = await client.send('Page.generatePageGraph')
  // Only the payload length is logged; the payload itself is handed back to the caller.
  const { data } = graphResponse
  logger.debug(`generatePageGraph { size: ${data.length} }`)
  return graphResponse
}

/**
 * Builds the output file name for a crawled URL.
 *
 * Every character of the URL that is not a word character is replaced by `_`,
 * and the current Unix time in whole seconds is appended.
 * A null/undefined URL yields the literal segment `undefined`.
 *
 * @param url - URL the graph was captured from.
 * @returns A name of the form `page_graph_<sanitized url>_<epoch seconds>.graphml`.
 */
function createFilename (url: Url) : FilePath {
  const sanitizedUrl = url?.replace(/[^\w]/g, '_')
  const epochSeconds = Math.floor(Date.now() / 1000)
  return `page_graph_${sanitizedUrl}_${epochSeconds}.graphml`
}

/**
 * Saves the data of a PageGraph response to a file inside `args.outputPath`.
 *
 * Write failures are logged at debug level and not rethrown, so the returned
 * promise never rejects because of a failed write.
 *
 * @param args - Crawl arguments; only `outputPath` is read here.
 * @param url - URL the graph belongs to; used to derive the file name.
 * @param response - Response object whose `data` field is written to disk.
 * @param logger - Receives a debug message if the write fails.
 * @returns A promise that settles once the write has finished or failed.
 *   Callers that do not need to wait may ignore it.
 */
function writeToFile (args: CrawlArgs, url: Url, response: any, logger: Logger): Promise<void> {
  // Static analysis (semgrep path-join-resolve-traversal) flags this join because
  // the file name is derived from the crawled URL. createFilename replaces every
  // non-word character with '_', so the URL part cannot contribute path
  // separators or '..' segments. args.outputPath itself is used as given.
  const outputFilename = pathLib.join(args.outputPath, createFilename(url))
  return fsExtraLib.writeFile(outputFilename, response.data).catch((err: Error) => {
    logger.debug('ERROR saving Page.generatePageGraph output:', err)
  })
}

export const doCrawl = async (args: CrawlArgs): Promise<void> => {
const logger = getLogger(args)
const url: Url = args.urls[0]
const depth = args.recursiveDepth || 1
let randomChildUrl: Url = null
let redirectedUrl: Url = null

const { puppeteerArgs, pathForProfile, shouldClean } = puppeteerConfigForArgs(args)

Expand All @@ -67,25 +92,29 @@ export const doCrawl = async (args: CrawlArgs): Promise<void> => {
await page.setUserAgent(args.userAgent)
}

logger.debug(`Navigating to ${url}`)
await page.goto(url, { waitUntil: 'domcontentloaded' })

const waitTimeMs = args.seconds * 1000
logger.debug(`Waiting for ${waitTimeMs}ms`)
await page.waitFor(waitTimeMs)

logger.debug('calling generatePageGraph')
const response = await client.send('Page.generatePageGraph')
logger.debug(`generatePageGraph { size: ${response.data.length} }`)
const outputFilename = pathLib.join(
args.outputPath,
`page_graph_${url?.replace(/[^\w]/g, '_')}_${Math.floor(
Date.now() / 1000
)}.graphml`
)
fsExtraLib.writeFile(outputFilename, response.data).catch((err: Error) => {
logger.debug('ERROR saving Page.generatePageGraph output:', err)
await page.setRequestInterception(true)
// First load is not a navigation redirect, so we need to skip it.
let firstLoad = true
page.on('request', async (request: any) => {
const parentFrame = request.frame().parentFrame()
// Only capture parent frame navigation requests.
logger.debug(`Request intercepted: ${request.url()}, first load: ${firstLoad}`)
if (request.isNavigationRequest() && parentFrame === null && !firstLoad) {
logger.debug('Page is redirecting...')
redirectedUrl = request.url()
// Stop page load
logger.debug(`Stopping page load of ${url}`)
await page._client.send('Page.stopLoading')
}
firstLoad = false
request.continue()
})

logger.debug(`Navigating to ${url}`)
await page.goto(url, { waitUntil: 'load' })
logger.debug(`Loaded ${url}`)
const response = await generatePageGraph(args.seconds, page, client, logger)
writeToFile(args, url, response, logger)
if (depth > 1) {
randomChildUrl = await getRandomLinkFromPage(page, logger)
}
Expand All @@ -101,11 +130,16 @@ export const doCrawl = async (args: CrawlArgs): Promise<void> => {
logger.debug('ERROR runtime fiasco from infrastructure:', err)
} finally {
envHandle.close()

if (shouldClean) {
fsExtraLib.remove(pathForProfile)
}
}
if (redirectedUrl) {
const newArgs = { ...args }
newArgs.urls = [redirectedUrl]
logger.debug(`Doing new crawl with redirected URL: ${redirectedUrl}`)
await doCrawl(newArgs)
}
if (randomChildUrl) {
const newArgs = { ...args }
newArgs.urls = [randomChildUrl]
Expand Down
6 changes: 6 additions & 0 deletions test/config.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
// Default parameters for the test suite.
// NOTE(review): the README states these can be overridden via environment
// variables; the override logic is not in this file — presumably in test/test.js.
export const config = {
// Debug switch, off by default (its exact effect is defined by the consumer of this config).
debug: false,
// Port for the local test pages — presumably the port the test server listens on; confirm in test/test.js.
port: 3000,
// Scheme and host used to address the test pages.
baseUrl: 'http://localhost',
// Default path of the PageGraph-enabled browser binary (macOS install location of Brave Nightly).
// The doubled backslashes yield a literal backslash before each space, i.e. a shell-escaped
// path — presumably because the value is interpolated into a shell command; verify before
// passing it to an API that expects an unescaped path.
pagegraph: '/Applications/Brave\\ Browser\\ Nightly.app/Contents/MacOS/Brave\\ Browser\\ Nightly'
}
1 change: 1 addition & 0 deletions test/pages/generate-index.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# Regenerates index.html, an HTML listing of the test pages in the current directory.
# Run it from the directory that holds the pages (no directory argument is given).
#   -H '.'            emit HTML, using '.' as the base href of the links
#   -L 1              list this directory only, do not descend
#   -t                sort entries by last modification time
#   --noreport        omit the trailing file/directory count
#   --charset utf-8   declare UTF-8 in the generated page
#   -P "*.html"       list only files matching *.html
#   -I "index.html"   leave the generated index itself out of the listing
#   -o index.html     write the result to index.html
tree -H '.' -L 1 -t --noreport --charset utf-8 -P "*.html" -I "index.html" -o index.html
49 changes: 49 additions & 0 deletions test/pages/index.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
<!DOCTYPE html>
<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
<meta name="Author" content="Made by 'tree'">
<meta name="GENERATOR" content="$Version: $ tree v1.8.0 (c) 1996 - 2018 by Steve Baker, Thomas Moore, Francesc Rocher, Florian Sesser, Kyosuke Tokoro $">
<title>Directory Tree</title>
<style type="text/css">
<!--
BODY { font-family : ariel, monospace, sans-serif; }
P { font-weight: normal; font-family : ariel, monospace, sans-serif; color: black; background-color: transparent;}
B { font-weight: normal; color: black; background-color: transparent;}
A:visited { font-weight : normal; text-decoration : none; background-color : transparent; margin : 0px 0px 0px 0px; padding : 0px 0px 0px 0px; display: inline; }
A:link { font-weight : normal; text-decoration : none; margin : 0px 0px 0px 0px; padding : 0px 0px 0px 0px; display: inline; }
A:hover { color : #000000; font-weight : normal; text-decoration : underline; background-color : yellow; margin : 0px 0px 0px 0px; padding : 0px 0px 0px 0px; display: inline; }
A:active { color : #000000; font-weight: normal; background-color : transparent; margin : 0px 0px 0px 0px; padding : 0px 0px 0px 0px; display: inline; }
.VERSION { font-size: small; font-family : arial, sans-serif; }
.NORM { color: black; background-color: transparent;}
.FIFO { color: purple; background-color: transparent;}
.CHAR { color: yellow; background-color: transparent;}
.DIR { color: blue; background-color: transparent;}
.BLOCK { color: yellow; background-color: transparent;}
.LINK { color: aqua; background-color: transparent;}
.SOCK { color: fuchsia;background-color: transparent;}
.EXEC { color: green; background-color: transparent;}
-->
</style>
</head>
<body>
<h1>Directory Tree</h1><p>
<a href=".">.</a><br>
├── <a href="./simple.html">simple.html</a><br>
├── <a href="./redirect-js-same-site.html">redirect-js-same-site.html</a><br>
├── <a href="./multiple-redirects-js-same-site.html">multiple-redirects-js-same-site.html</a><br>
└── <a href="./redirect-js-cross-site.html">redirect-js-cross-site.html</a><br>
<br><br>
</p>
<p>
<br><br>
</p>
<hr>
<p class="VERSION">
tree v1.8.0 © 1996 - 2018 by Steve Baker and Thomas Moore <br>
HTML output hacked and copyleft © 1998 by Francesc Rocher <br>
JSON output hacked and copyleft © 2014 by Florian Sesser <br>
Charsets / OS/2 support © 2001 by Kyosuke Tokoro
</p>
</body>
</html>
7 changes: 7 additions & 0 deletions test/pages/multiple-redirects-js-same-site.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<html>
<!-- Random token for testing -->
NsybZB0LO4
<script>
window.location.replace("./redirect-js-same-site.html");
</script>
</html>
7 changes: 7 additions & 0 deletions test/pages/redirect-js-cross-site.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<html>
<!-- Random token for testing -->
Zym8MZp
<script>
  // Cross-site redirect fixture: jump to /simple.html on 127.0.0.1.
  // The target is derived from the current location so that scheme and port
  // follow whatever the test server uses instead of assuming port 3000;
  // only the hostname is swapped to make the navigation cross-site.
  var target = new URL("/simple.html", window.location.href);
  target.hostname = "127.0.0.1";
  window.location.replace(target.href);
</script>
</html>
7 changes: 7 additions & 0 deletions test/pages/redirect-js-same-site.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<html>
<!-- Random token for testing -->
W0XNNnar
<script>
window.location.replace("./simple.html");
</script>
</html>
4 changes: 4 additions & 0 deletions test/pages/simple.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
<html>
<!-- Random token used for testing -->
hJc9ZK1sGr
</html>
Loading