# compute_counts.py
# coding: utf-8
import csv
import sys
from collections import Counter
from urlparse import urlparse

f = open(sys.argv[1])
data = csv.reader(f, delimiter='\t')
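
# Each input row is expected to be tab-separated:
#   raw_url <TAB> page <TAB> outlinks <TAB> [cat1, cat2, ...]
# Usage sketch (the filename is illustrative): python compute_counts.py counts.tsv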
url_cnt = Counter()       # hits per domain
cat_cnt = Counter()       # hits per category
url_cat_cnt = Counter()   # hits per (domain, category) pair
page_cnt = Counter()      # hits per page
bad = 0  # rows mangled by interleaved parallel prints upstream; should be rare

def processURL(url):
    # reduce a full URL to its scheme://domain/ prefix
    # (malformed inputs can yield '///' here)
    parsed_uri = urlparse(url)
    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
    return domain
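
# For example (illustrative URL, not from the data):
#   processURL('http://en.wikipedia.org/wiki/Graph') -> 'http://en.wikipedia.org/'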

def isValid(raw_url, page, outlinks, cats):
    # keep only rows with a real URL and at least 100 outlinks
    if raw_url == "null":
        return False
    if outlinks < 100:
        return False
    return True
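
# For example (hypothetical values): a "null" URL or a row with
# outlinks == 42 is dropped; a real URL with outlinks >= 100 passes.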
for row in data:
    if len(row) == 4:
        raw_url = row[0]
        page = row[1]
        outlinks = int(row[2])
        # categories arrive serialized as "[cat1, cat2, ...]"
        cats = row[3].strip('[]').split(',')
        cats = [c.strip() for c in cats]

        if not isValid(raw_url, page, outlinks, cats):
            continue

        url = processURL(raw_url)
        page_cnt.update((page,))
        cat_cnt.update(cats)
        url_cnt.update((url,))
        # joint key: domain and category, joined with '#'
        for cat in cats:
            url_cat_cnt.update((url + "#" + cat,))
    else:
        # malformed row (wrong column count)
        bad += 1

print "botched due to parallel",bad

# for i in url_cnt.most_common(int(sys.argv[2])):
#     print i
# for i in cat_cnt.most_common(int(sys.argv[2])):
#     print i

# print the ten most frequent pages; most_common() yields (page, count) pairs
for page, cnt in page_cnt.most_common(10):
    print page, cnt

# for i in url_cat_cnt:
#     print i,url_cat_cnt[i]