Commit 26f435bb authored by Shyam Upadhyay

WIP

parent f9b0acb6
@@ -2,25 +2,61 @@
 import csv
 import sys
 from collections import Counter
+from urlparse import urlparse
+
 f = open(sys.argv[1])
 data = csv.reader(f, delimiter='\t')
 cnt = Counter()
+url_cnt = Counter()
+cat_cnt = Counter()
+url_cat_cnt = Counter()
+page_cnt = Counter()
+bad = 0  # rows mangled by interleaved parallel printing; should not be too many
+
+def processURL(url):
+    # reduce a full URL to its scheme://domain/ prefix
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
+    # if "///" in domain:
+    #     print url
+    return domain
+
+def isValid(raw_url, page, outlinks, cats):
+    # keep only rows with a real URL and at least 100 outlinks
+    if raw_url == "null":
+        return False
+    if outlinks < 100:
+        return False
+    return True
+
 for row in data:
-    # print row
-    for i in row:
-        idx = i.find(':')
-        name = i[0:idx]
-        val = i[idx + 1:]
-        if name == "URL":
-            if val != "null":
-                print val
-                val = val.strip("http://")
-                jj = val.find('/')
-                # print val[0:jj]
-                cnt.update((val[0:jj],))
-            else:
-                print "NULL"
-for i in cnt.most_common(int(sys.argv[2])):
-    print i
+    # expected row layout: raw_url <TAB> page <TAB> outlinks <TAB> [cat1, cat2, ...]
+    if len(row) == 4:
+        raw_url = row[0]
+        page = row[1]
+        outlinks = int(row[2])
+        cats = row[3].strip('[').strip(']').split(',')
+        cats = map(lambda x: x.strip(), cats)
+        if not isValid(raw_url, page, outlinks, cats):
+            continue
+        url = processURL(raw_url)
+        page_cnt.update((page,))
+        # print cats
+        cat_cnt.update(cats)
+        url_cnt.update((url,))
+        for cat in cats:
+            url_cat_cnt.update((url + "#" + cat,))
+    else:
+        # print "BAD", row
+        bad += 1
+print "botched due to parallel writes", bad
+# for i in url_cnt.most_common(int(sys.argv[2])):
+#     print i
+# for i in cat_cnt.most_common(int(sys.argv[2])):
+#     print i
+for page, count in page_cnt.most_common(10):
+    print page, count
+# for i in url_cat_cnt:
+#     print i, url_cat_cnt[i]
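For context, a minimal usage sketch of the statistics script above, assuming it is saved as stats.py and fed the tab-separated crawl log whose rows now carry raw_url, page, outlinks, and a bracketed category list (the file names are illustrative, not from the commit; the old top-K second argument is only referenced from commented-out code after this change):

    # hypothetical input row (tab-separated):
    # http://www.smh.com.au/some/story    Some_Wiki_Page    250    [politics, world]
    python stats.py crawl_stats.tsv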
@@ -24,7 +24,7 @@ MAIN="$PACKAGE_PREFIX.wiki.events.CiteMiner"
 #MAIN="$PACKAGE_PREFIX.wiki.events.TrainingDataGenerator"
 #MAIN="$PACKAGE_PREFIX.wiki.events.baseline.Baseline"
 MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
-MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
+# MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
 #MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.CleanData"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.SimpleWikipageIndexer"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.IndexSimpleWikipedia"
@@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.wiki.events.indexing.CitationIndexer;
 public class ArticleWorker implements Runnable {
     private List<String> jobs;
+    private String dumpath;

     @Override
     public void run() {
@@ -22,7 +23,7 @@ public class ArticleWorker implements Runnable {
                     + line;
             trial++;
             try {
-                String filepath = "nothman-data/newsbusiness" + "/" + md5;
+                String filepath = dumpath + "/" + md5;
                 if (NewsArticleMiner.dumpNewsArticle(filepath, urlstr)) {
                     hit++;
                 } else {
@@ -40,7 +41,8 @@ public class ArticleWorker implements Runnable {
         }
     }

-    public ArticleWorker(List<String> jobs) {
+    public ArticleWorker(List<String> jobs, String dumpath) {
         this.jobs = jobs;
+        this.dumpath = dumpath;
     }
 }
@@ -44,9 +44,11 @@ import edu.illinois.cs.cogcomp.wikifier.utils.datastructure.StringCounter;
 public class NewsArticleMiner {
     private String idfilepath;
+    private String dumpath;

-    public NewsArticleMiner(String idfilepath) {
+    public NewsArticleMiner(String idfilepath, String dumpath) {
         this.idfilepath = idfilepath;
+        this.dumpath = dumpath;
     }

     public static String fetchArticleFromWeb(String urlstr)
@@ -62,44 +64,47 @@ public class NewsArticleMiner {
         }
     }

     public static String fetchHTMLArticleFromWeb(String urlstr)
             throws MalformedURLException {
         URL url;
-        InputStream is = null;
-        BufferedReader br;
-        String line;
-        String content="";
+        InputStream is = null;
+        BufferedReader br;
+        String line;
+        String content = "";
         try {
-            url = new URL(urlstr);
-            is = url.openStream(); // throws an IOException
-            br = new BufferedReader(new InputStreamReader(is));
-            while ((line = br.readLine()) != null) {
-                System.out.println(line);
-                content+=line+"\n";
-            }
-        } catch (MalformedURLException mue) {
-            mue.printStackTrace();
-        } catch (IOException ioe) {
-            ioe.printStackTrace();
-        } finally {
-            try {
-                if (is != null) is.close();
-            } catch (IOException ioe) {
-                // nothing to see here
-            }
-        }
+            url = new URL(urlstr);
+            is = url.openStream(); // throws an IOException
+            br = new BufferedReader(new InputStreamReader(is));
+            while ((line = br.readLine()) != null) {
+                System.out.println(line);
+                content += line + "\n";
+            }
+        } catch (MalformedURLException mue) {
+            mue.printStackTrace();
+        } catch (IOException ioe) {
+            ioe.printStackTrace();
+        } finally {
+            try {
+                if (is != null)
+                    is.close();
+            } catch (IOException ioe) {
+                // nothing to see here
+            }
+        }
         return content;
     }

     static int empty = 0;

     public static boolean dumpNewsArticle(String filepath, String url)
             throws FileNotFoundException, MalformedURLException,
             BoilerpipeProcessingException {
-        // String filepath = Experiments.datapath + "/" + md5;
+        // String filepath = Experiments.datapath + "/" + md5;
         if (!IOUtils.exists(filepath)) {
-            // String content = fetchArticleFromWeb(url);
-            String content = fetchHTMLArticleFromWeb(url);
+            String content = fetchArticleFromWeb(url);
+            // String content = fetchHTMLArticleFromWeb(url);
             if (content != null && content.trim().length() >= 20) {
                 PrintWriter w = new PrintWriter(new File(filepath));
                 w.println(content);
@@ -133,7 +138,7 @@ public class NewsArticleMiner {
             List<String> subList = lines.subList(i
                     * (lines.size() / NUM_THREADS), ((i + 1)
                     * (lines.size() / NUM_THREADS) - 1));
-            ArticleWorker articleWorker = new ArticleWorker(subList);
+            ArticleWorker articleWorker = new ArticleWorker(subList, dumpath);
             executor.submit(articleWorker);
         }
         executor.shutdown();
@@ -167,43 +172,45 @@ public class NewsArticleMiner {
         System.out.println(counter.getTopK(10));
     }

-    public static void getNothmanData(String idsFile) throws FileNotFoundException, MalformedURLException
-    {
+    public static void getNothmanData(String idsFile)
+            throws FileNotFoundException, MalformedURLException {
         List<String> urllist = LineIO.read(idsFile);
         String title;
-        for(int i=0;i<10;i++)
-        {
+        for (int i = 0; i < 10; i++) {
             System.out.println(i);
-            title=urllist.get(i);
-            PrintWriter w = new PrintWriter("nothman-data/"+title);
-            w.println(NewsArticleMiner.fetchArticleFromWeb("http://newsstore.smh.com.au/apps/viewDocument.ac?docID="+title));
+            title = urllist.get(i);
+            PrintWriter w = new PrintWriter("nothman-data/" + title);
+            w.println(NewsArticleMiner
+                    .fetchArticleFromWeb("http://newsstore.smh.com.au/apps/viewDocument.ac?docID="
+                            + title));
             w.close();
         }
     }

     public static void main(String[] args)
             throws BoilerpipeProcessingException, IOException {
-        // NewsArticleMiner miner= new NewsArticleMiner(args[0]);
-        // miner.run();
-        dumpNewsArticle("nothman-data/test.html","http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
-        // getNothmanData(args[0]);
-        // newsArticleMiner.testSanityOfCorpus();
-        // newsArticleMiner.getCorpusStats();
-        // int c = 0;
-        // BufferedReader br = new BufferedReader(new FileReader(new File(
-        //         "inIndexAndPopularNews")));
-        // String line;
-        // Citation citation = null;
-        // while ((line = br.readLine()) != null) {
-        //     citation = Citation.gsonInstance.fromJson(line, Citation.class);
-        //     createNewsArticle(citation.getHash(), citation.getURL());
-        //     if (c++ % 100 == 0) {
-        //         System.out.println("Done " + c);
-        //     }
-        // }
+        NewsArticleMiner miner = new NewsArticleMiner(args[0], args[1]);
+        miner.run();
+        // dumpNewsArticle("nothman-data/test.html",
+        //         "http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
+        // getNothmanData(args[0]);
+        // newsArticleMiner.testSanityOfCorpus();
+        // newsArticleMiner.getCorpusStats();
+        // int c = 0;
+        // BufferedReader br = new BufferedReader(new FileReader(new File(
+        //         "inIndexAndPopularNews")));
+        // String line;
+        // Citation citation = null;
+        // while ((line = br.readLine()) != null) {
+        //     citation = Citation.gsonInstance.fromJson(line, Citation.class);
+        //     createNewsArticle(citation.getHash(), citation.getURL());
+        //     if (c++ % 100 == 0) {
+        //         System.out.println("Done " + c);
+        //     }
+        // }
     }
 }
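With this commit the dump directory is no longer hard-coded to nothman-data/newsbusiness; it arrives as args[1] and is threaded from NewsArticleMiner into each ArticleWorker. A minimal invocation sketch, assuming the surrounding run script supplies the classpath the same way it supplies MAIN (the classpath variable, id-file, and output-directory names are illustrative, not from the commit):

    # args[0]: file listing article URLs/ids; args[1]: directory for dumped articles
    java -cp "$CP" "$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner" \
        url_ids.txt nothman-data/newsbusiness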