Commit 26f435bb authored by Shyam Upadhyay

WIP

parent f9b0acb6
@@ -2,25 +2,61 @@
 import csv
 import sys
 from collections import Counter
+from urlparse import urlparse
+
 f=open(sys.argv[1])
 data=csv.reader(f,delimiter='\t')
 cnt=Counter()
+url_cnt=Counter()
+cat_cnt=Counter()
+url_cat_cnt=Counter()
+page_cnt=Counter()
+bad=0  # rows botched by parallel writers racing on the output; should be few
+
+def processURL(url):
+    # reduce a full URL to its scheme://domain/ prefix
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
+    # if "///" in domain:
+    #     print url
+    return domain
+
+def isValid(raw_url,page,outlinks,cats):
+    # keep only rows with a real URL and at least 100 outlinks
+    if raw_url=="null":
+        return False
+    if outlinks < 100:
+        return False
+    return True
+
 for row in data:
-    # print row
-    for i in row:
-        idx=i.find(':')
-        name=i[0:idx]
-        val=i[idx+1:]
-        if name=="URL":
-            if val!="null":
-                print val
-                val=val.strip("http://")
-                jj=val.find('/')
-                # print val[0:jj]
-                cnt.update((val[0:jj],))
-            else:
-                print "NULL"
-for i in cnt.most_common(int(sys.argv[2])):
-    print i
+    if len(row)==4:
+        raw_url=row[0]
+        page=row[1]
+        outlinks=int(row[2])
+        cats=row[3].strip('[').strip(']').split(',')
+        cats = map(lambda x: x.strip(),cats)
+        if not isValid(raw_url,page,outlinks,cats):
+            continue
+        url=processURL(raw_url)
+        page_cnt.update((page,))
+        # print cats
+        cat_cnt.update(cats)
+        url_cnt.update((url,))
+        for cat in cats:
+            url_cat_cnt.update((url+"#"+cat,))
+    else:
+        # print "BAD",row
+        bad+=1
+print "botched due to parallel", bad
+# for i in url_cnt.most_common(int(sys.argv[2])):
+#     print i
+# for i in cat_cnt.most_common(int(sys.argv[2])):
+#     print i
+# most_common() yields (key, count) pairs, so unpack instead of re-indexing
+for i, c in page_cnt.most_common(10):
+    print i, c
+# for i in url_cat_cnt:
+#     print i, url_cat_cnt[i]
@@ -24,7 +24,7 @@ MAIN="$PACKAGE_PREFIX.wiki.events.CiteMiner"
 #MAIN="$PACKAGE_PREFIX.wiki.events.TrainingDataGenerator"
 #MAIN="$PACKAGE_PREFIX.wiki.events.baseline.Baseline"
 MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
-MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
+# MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
 #MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.CleanData"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.SimpleWikipageIndexer"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.IndexSimpleWikipedia"
@@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.wiki.events.indexing.CitationIndexer;
 public class ArticleWorker implements Runnable {

     private List<String> jobs;
+    private String dumpath;

     @Override
     public void run() {
@@ -22,7 +23,7 @@ public class ArticleWorker implements Runnable {
                         + line;
                 trial++;
                 try {
-                    String filepath = "nothman-data/newsbusiness" + "/" + md5;
+                    String filepath = dumpath + "/" + md5;
                     if (NewsArticleMiner.dumpNewsArticle(filepath, urlstr)) {
                         hit++;
                     } else {
@@ -40,7 +41,8 @@ public class ArticleWorker implements Runnable {
         }
     }

-    public ArticleWorker(List<String> jobs) {
+    public ArticleWorker(List<String> jobs, String dumpath) {
         this.jobs = jobs;
+        this.dumpath = dumpath;
     }
 }
@@ -44,9 +44,11 @@ import edu.illinois.cs.cogcomp.wikifier.utils.datastructure.StringCounter;
 public class NewsArticleMiner {

     private String idfilepath;
+    private String dumpath;

-    public NewsArticleMiner(String idfilepath) {
+    public NewsArticleMiner(String idfilepath, String dumpath) {
         this.idfilepath = idfilepath;
+        this.dumpath = dumpath;
     }

     public static String fetchArticleFromWeb(String urlstr)
@@ -62,44 +64,47 @@ public class NewsArticleMiner {
         }
     }

     public static String fetchHTMLArticleFromWeb(String urlstr)
             throws MalformedURLException {
         URL url;
-        InputStream is = null;
-        BufferedReader br;
-        String line;
-        String content="";
+        InputStream is = null;
+        BufferedReader br;
+        String line;
+        String content = "";
         try {
-        url = new URL(urlstr);
-        is = url.openStream(); // throws an IOException
-        br = new BufferedReader(new InputStreamReader(is));
-        while ((line = br.readLine()) != null) {
-            System.out.println(line);
-            content+=line+"\n";
-        }
-        } catch (MalformedURLException mue) {
-            mue.printStackTrace();
-        } catch (IOException ioe) {
-            ioe.printStackTrace();
-        } finally {
-            try {
-                if (is != null) is.close();
-            } catch (IOException ioe) {
-                // nothing to see here
-            }
-        }
+            url = new URL(urlstr);
+            is = url.openStream(); // throws an IOException
+            br = new BufferedReader(new InputStreamReader(is));
+            while ((line = br.readLine()) != null) {
+                System.out.println(line);
+                content += line + "\n";
+            }
+        } catch (MalformedURLException mue) {
+            mue.printStackTrace();
+        } catch (IOException ioe) {
+            ioe.printStackTrace();
+        } finally {
+            try {
+                if (is != null)
+                    is.close();
+            } catch (IOException ioe) {
+                // nothing to see here
+            }
+        }
         return content;
     }

     static int empty = 0;

     public static boolean dumpNewsArticle(String filepath, String url)
             throws FileNotFoundException, MalformedURLException,
             BoilerpipeProcessingException {
-        // String filepath = Experiments.datapath + "/" + md5;
+        // String filepath = Experiments.datapath + "/" + md5;
         if (!IOUtils.exists(filepath)) {
-            // String content = fetchArticleFromWeb(url);
-            String content = fetchHTMLArticleFromWeb(url);
+            String content = fetchArticleFromWeb(url);
+            // String content = fetchHTMLArticleFromWeb(url);
             if (content != null && content.trim().length() >= 20) {
                 PrintWriter w = new PrintWriter(new File(filepath));
                 w.println(content);
@@ -133,7 +138,7 @@ public class NewsArticleMiner {
             List<String> subList = lines.subList(i
                     * (lines.size() / NUM_THREADS), ((i + 1)
                     * (lines.size() / NUM_THREADS) - 1));
-            ArticleWorker articleWorker = new ArticleWorker(subList);
+            ArticleWorker articleWorker = new ArticleWorker(subList, dumpath);
             executor.submit(articleWorker);
         }
         executor.shutdown();
@@ -167,43 +172,45 @@ public class NewsArticleMiner {
         System.out.println(counter.getTopK(10));
     }

-    public static void getNothmanData(String idsFile) throws FileNotFoundException, MalformedURLException
-    {
+    public static void getNothmanData(String idsFile)
+            throws FileNotFoundException, MalformedURLException {
         List<String> urllist = LineIO.read(idsFile);
         String title;
-        for(int i=0;i<10;i++)
-        {
+        for (int i = 0; i < 10; i++) {
             System.out.println(i);
-            title=urllist.get(i);
-            PrintWriter w = new PrintWriter("nothman-data/"+title);
-            w.println(NewsArticleMiner.fetchArticleFromWeb("http://newsstore.smh.com.au/apps/viewDocument.ac?docID="+title));
+            title = urllist.get(i);
+            PrintWriter w = new PrintWriter("nothman-data/" + title);
+            w.println(NewsArticleMiner
+                    .fetchArticleFromWeb("http://newsstore.smh.com.au/apps/viewDocument.ac?docID="
+                            + title));
             w.close();
         }
     }

     public static void main(String[] args)
             throws BoilerpipeProcessingException, IOException {
-        // NewsArticleMiner miner= new NewsArticleMiner(args[0]);
-        // miner.run();
-        dumpNewsArticle("nothman-data/test.html","http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
-        //getNothmanData(args[0]);
-        // newsArticleMiner.testSanityOfCorpus();
-        // newsArticleMiner.getCorpusStats();
-        // int c = 0;
-        // BufferedReader br = new BufferedReader(new FileReader(new File(
-        // "inIndexAndPopularNews")));
-        // String line;
-        // Citation citation = null;
-        // while ((line = br.readLine()) != null) {
-        // citation = Citation.gsonInstance.fromJson(line, Citation.class);
-        // createNewsArticle(citation.getHash(),citation.getURL());
-        // if(c++ % 100 ==0)
-        // {
-        // System.out.println("Done "+c);
-        // }
-        // }
+        NewsArticleMiner miner = new NewsArticleMiner(args[0], args[1]);
+        miner.run();
+        // dumpNewsArticle("nothman-data/test.html",
+        // "http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
+        // getNothmanData(args[0]);
+        // newsArticleMiner.testSanityOfCorpus();
+        // newsArticleMiner.getCorpusStats();
+        // int c = 0;
+        // BufferedReader br = new BufferedReader(new FileReader(new File(
+        // "inIndexAndPopularNews")));
+        // String line;
+        // Citation citation = null;
+        // while ((line = br.readLine()) != null) {
+        // citation = Citation.gsonInstance.fromJson(line, Citation.class);
+        // createNewsArticle(citation.getHash(), citation.getURL());
+        // if (c++ % 100 == 0) {
+        // System.out.println("Done " + c);
+        // }
+        // }
     }
 }