-
Notifications
You must be signed in to change notification settings - Fork 0
/
pornAnalizer.py
executable file
·73 lines (62 loc) · 2.52 KB
/
pornAnalizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
#!/bin/python
import urllib.request
import http.client
import os
import threading
from queue import Queue
# Queue of page numbers waiting to be downloaded by the worker threads
q = Queue()
# Serializes writes to the shared intermediate file across worker threads
file_lock = threading.Lock()
# Set the HTTP version to 1.0 (this avoids incomplete read errors)
http.client.HTTPConnection._http_vsn = 10
http.client.HTTPConnection._http_vsn_str = 'HTTP/1.0'
# Command to extract titles from Pornhub's top rated page as of 5/4/2017
# Firstly this isolates the titles themselves, then after some necessary
# transformations, I run sort and uniq to get a count of each word, I then
# eliminate meaningless terms such as "and" and "for", lastly all the data is
# appended to results.csv as count,word rows.
# Backslashes meant for sed are doubled so Python passes them through without
# an invalid-escape warning. The rare-word filter is anchored to the start of
# the line so that only counts of exactly 1 or 2 are dropped (an unanchored
# '[12] ' would also drop counts such as 12, 21 or 41).
bashExtract = '''cat stats.txt | grep -e '<a href.*title=\".*</a>' | sed 's/.*"//g' |
tr -d '<>' | sed 's/\\/a$//g' | sed '/\\/a/d' | sed "s/'/\'/g" |
tr ' ' '\n' | tr A-Z a-z | sort | uniq -c | tr -s ' ' | sort -nr |
sed '/^ *[12] /d' | sed '/-$/d' | sed '/ to$/d' | sed '/ of$/d' |
sed '/ it$/d' | sed '/ me$/d' | sed '/ my$/d' | sed '/ in$/d' |
sed '/ with$/d' | sed '/ at$/d' | sed '/ and$/d' | sed '/ &$/d' |
sed '/ the$/d' | sed '/ a$/d' | sed '/ for$/d' | sed 's/^ //g' |
tr ' ' ',' >> results.csv'''
# stats.txt is an intermediate file used to hold GET data
dataFile = open('stats.txt', 'wb+')
def getPage(n):
    """Download one page of the top rated listing and append it to stats.txt.

    n is the page number substituted into the listing URL. The raw response
    bytes are written to the shared ``dataFile`` while holding ``file_lock``
    so that concurrent workers do not interleave their writes.

    Exceptions raised while opening the URL are not handled here and
    propagate to the caller.
    """
    # As of 5/4/2017 Pornhub's top rated pages use this url format
    url = "https://www.pornhub.com/video?o=tr&page=%d" % (n)
    # Send an HTTP GET request and read the data; the context manager closes
    # the response even when reading fails part-way through
    with urllib.request.urlopen(url) as request:
        try:
            data = request.read()
        except http.client.IncompleteRead as e:
            # Keep whatever arrived before the connection was cut short
            data = e.partial
    # Write the data into a file that will be parsed for titles later
    with file_lock:
        dataFile.write(data)
# Worker loop for the download threads: pulls a page number from the queue of
# tasks and downloads the page specified by that number
def threader():
    """Process page numbers from ``q`` forever, one download per item.

    Every item taken from the queue is marked done even when the download
    fails, so a failed page cannot leave ``q.join()`` waiting forever. A
    failure is reported on stderr and the worker moves on to the next item.
    """
    import traceback

    while True:
        worker = q.get()
        try:
            getPage(worker)
        except Exception:
            # Report the failed page but keep this worker alive for others
            traceback.print_exc()
        finally:
            q.task_done()
# Start the pool of worker threads; they are daemon threads, so they do not
# keep the program alive once the main thread has finished
for x in range(30):
    t = threading.Thread(target=threader)
    t.daemon = True
    t.start()
# Queue pages 1 through 29 (range excludes its upper bound)
for worker in range(1, 30):
    q.put(worker)
# Wait until every queued page has been marked done
q.join()
# Close the intermediate file before it is parsed so that all buffered page
# data has reached the disk by the time the shell pipeline reads it
dataFile.close()
# Start results.csv with the header row. The placeholder row after it ensures
# that the d3 visualization will not discard the real first value
with open('results.csv', 'w') as results:
    results.write('value,id\n0,empty\n')
# Extract the titles from the raw data and append the counts to results.csv
os.system(bashExtract)
# Clean up the intermediate file
os.remove('stats.txt')