Commit 26f435bb authored by Shyam Upadhyay

WIP

parent f9b0acb6
@@ -2,25 +2,61 @@
 import csv
 import sys
 from collections import Counter
+from urlparse import urlparse
+
 f=open(sys.argv[1])
 data=csv.reader(f,delimiter='\t')
 cnt=Counter()
+url_cnt=Counter()
+cat_cnt=Counter()
+url_cat_cnt=Counter()
+page_cnt=Counter()
+bad=0  # rows botched by parallel writers racing on the output; should be few
+
+def processURL(url):
+    # reduce a full URL to its scheme://domain/ prefix
+    parsed_uri = urlparse(url)
+    domain = '{uri.scheme}://{uri.netloc}/'.format(uri=parsed_uri)
+    # if "///" in domain:
+    #     print url
+    return domain
+
+def isValid(raw_url,page,outlinks,cats):
+    # keep only rows with a real URL and at least 100 outlinks
+    if raw_url=="null":
+        return False
+    if outlinks < 100:
+        return False
+    return True
+
 for row in data:
-    # print row
-    for i in row:
-        idx=i.find(':')
-        name=i[0:idx]
-        val=i[idx+1:]
-        if name=="URL":
-            if val!="null":
-                print val
-                val=val.strip("http://")
-                jj=val.find('/')
-                # print val[0:jj]
-                cnt.update((val[0:jj],))
-            else:
-                print "NULL"
-for i in cnt.most_common(int(sys.argv[2])):
-    print i
+    if len(row)==4:
+        raw_url=row[0]
+        page=row[1]
+        outlinks=int(row[2])
+        cats=row[3].strip('[').strip(']').split(',')
+        cats = map(lambda x: x.strip(),cats)
+        if not isValid(raw_url,page,outlinks,cats):
+            continue
+        url=processURL(raw_url)
+        page_cnt.update((page,))
+        # print cats
+        cat_cnt.update(cats)
+        url_cnt.update((url,))
+        for cat in cats:
+            url_cat_cnt.update((url+"#"+cat,))
+    else:
+        # print "BAD",row
+        bad+=1
+print "botched due to parallel", bad
+# for i in url_cnt.most_common(int(sys.argv[2])):
+#     print i
+# for i in cat_cnt.most_common(int(sys.argv[2])):
+#     print i
+# most_common() yields (key, count) pairs, so unpack instead of re-indexing
+for i, c in page_cnt.most_common(10):
+    print i, c
+# for i in url_cat_cnt:
+#     print i, url_cat_cnt[i]
@@ -24,7 +24,7 @@ MAIN="$PACKAGE_PREFIX.wiki.events.CiteMiner"
 #MAIN="$PACKAGE_PREFIX.wiki.events.TrainingDataGenerator"
 #MAIN="$PACKAGE_PREFIX.wiki.events.baseline.Baseline"
 MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
-MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
+# MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
 #MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.CleanData"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.SimpleWikipageIndexer"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.IndexSimpleWikipedia"
@@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.wiki.events.indexing.CitationIndexer;
 public class ArticleWorker implements Runnable {

     private List<String> jobs;
+    private String dumpath;

     @Override
     public void run() {
@@ -22,7 +23,7 @@ public class ArticleWorker implements Runnable {
                         + line;
                 trial++;
                 try {
-                    String filepath = "nothman-data/newsbusiness" + "/" + md5;
+                    String filepath = dumpath + "/" + md5;
                     if (NewsArticleMiner.dumpNewsArticle(filepath, urlstr)) {
                         hit++;
                     } else {
@@ -40,7 +41,8 @@ public class ArticleWorker implements Runnable {
         }
     }

-    public ArticleWorker(List<String> jobs) {
+    public ArticleWorker(List<String> jobs, String dumpath) {
         this.jobs = jobs;
+        this.dumpath = dumpath;
     }
 }
@@ -44,9 +44,11 @@ import edu.illinois.cs.cogcomp.wikifier.utils.datastructure.StringCounter;
 public class NewsArticleMiner {

     private String idfilepath;
+    private String dumpath;

-    public NewsArticleMiner(String idfilepath) {
+    public NewsArticleMiner(String idfilepath, String dumpath) {
         this.idfilepath = idfilepath;
+        this.dumpath = dumpath;
     }

     public static String fetchArticleFromWeb(String urlstr)
@@ -62,44 +64,47 @@ public class NewsArticleMiner {
         }
     }

     public static String fetchHTMLArticleFromWeb(String urlstr)
             throws MalformedURLException {
         URL url;
-        InputStream is = null;
-        BufferedReader br;
-        String line;
-        String content="";
+        InputStream is = null;
+        BufferedReader br;
+        String line;
+        String content = "";
         try {
-        url = new URL(urlstr);
-        is = url.openStream(); // throws an IOException
-        br = new BufferedReader(new InputStreamReader(is));
-        while ((line = br.readLine()) != null) {
-            System.out.println(line);
-            content+=line+"\n";
-        }
-        } catch (MalformedURLException mue) {
-            mue.printStackTrace();
-        } catch (IOException ioe) {
-            ioe.printStackTrace();
-        } finally {
-            try {
-                if (is != null) is.close();
-            } catch (IOException ioe) {
-                // nothing to see here
-            }
-        }
+            url = new URL(urlstr);
+            is = url.openStream(); // throws an IOException
+            br = new BufferedReader(new InputStreamReader(is));
+            while ((line = br.readLine()) != null) {
+                System.out.println(line);
+                content += line + "\n";
+            }
+        } catch (MalformedURLException mue) {
+            mue.printStackTrace();
+        } catch (IOException ioe) {
+            ioe.printStackTrace();
+        } finally {
+            try {
+                if (is != null)
+                    is.close();
+            } catch (IOException ioe) {
+                // nothing to see here
+            }
+        }
         return content;
     }

     static int empty = 0;

     public static boolean dumpNewsArticle(String filepath, String url)
             throws FileNotFoundException, MalformedURLException,
             BoilerpipeProcessingException {
-        // String filepath = Experiments.datapath + "/" + md5;
+        // String filepath = Experiments.datapath + "/" + md5;
         if (!IOUtils.exists(filepath)) {
-            // String content = fetchArticleFromWeb(url);
-            String content = fetchHTMLArticleFromWeb(url);
+            String content = fetchArticleFromWeb(url);
+            // String content = fetchHTMLArticleFromWeb(url);
             if (content != null && content.trim().length() >= 20) {
                 PrintWriter w = new PrintWriter(new File(filepath));
                 w.println(content);
@@ -133,7 +138,7 @@ public class NewsArticleMiner {
             List<String> subList = lines.subList(i
                     * (lines.size() / NUM_THREADS), ((i + 1)
                     * (lines.size() / NUM_THREADS) - 1));
-            ArticleWorker articleWorker = new ArticleWorker(subList);
+            ArticleWorker articleWorker = new ArticleWorker(subList, dumpath);
             executor.submit(articleWorker);
         }
         executor.shutdown();
@@ -167,43 +172,45 @@ public class NewsArticleMiner {
         System.out.println(counter.getTopK(10));
     }

-    public static void getNothmanData(String idsFile) throws FileNotFoundException, MalformedURLException
-    {
+    public static void getNothmanData(String idsFile)
+            throws FileNotFoundException, MalformedURLException {
         List<String> urllist = LineIO.read(idsFile);
         String title;
-        for(int i=0;i<10;i++)
-        {
+        for (int i = 0; i < 10; i++) {
             System.out.println(i);
-            title=urllist.get(i);
-            PrintWriter w = new PrintWriter("nothman-data/"+title);
-            w.println(NewsArticleMiner.fetchArticleFromWeb("http://newsstore.smh.com.au/apps/viewDocument.ac?docID="+title));
+            title = urllist.get(i);
+            PrintWriter w = new PrintWriter("nothman-data/" + title);
+            w.println(NewsArticleMiner
+                    .fetchArticleFromWeb("http://newsstore.smh.com.au/apps/viewDocument.ac?docID="
+                            + title));
             w.close();
         }
     }

     public static void main(String[] args)
             throws BoilerpipeProcessingException, IOException {
-        // NewsArticleMiner miner= new NewsArticleMiner(args[0]);
-        // miner.run();
-        dumpNewsArticle("nothman-data/test.html","http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
-        //getNothmanData(args[0]);
-        // newsArticleMiner.testSanityOfCorpus();
-        // newsArticleMiner.getCorpusStats();
-        // int c = 0;
-        // BufferedReader br = new BufferedReader(new FileReader(new File(
-        // "inIndexAndPopularNews")));
-        // String line;
-        // Citation citation = null;
-        // while ((line = br.readLine()) != null) {
-        // citation = Citation.gsonInstance.fromJson(line, Citation.class);
-        // createNewsArticle(citation.getHash(),citation.getURL());
-        // if(c++ % 100 ==0)
-        // {
-        // System.out.println("Done "+c);
-        // }
-        // }
+        NewsArticleMiner miner = new NewsArticleMiner(args[0], args[1]);
+        miner.run();
+        // dumpNewsArticle("nothman-data/test.html",
+        // "http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
+        // getNothmanData(args[0]);
+        // newsArticleMiner.testSanityOfCorpus();
+        // newsArticleMiner.getCorpusStats();
+        // int c = 0;
+        // BufferedReader br = new BufferedReader(new FileReader(new File(
+        // "inIndexAndPopularNews")));
+        // String line;
+        // Citation citation = null;
+        // while ((line = br.readLine()) != null) {
+        // citation = Citation.gsonInstance.fromJson(line, Citation.class);
+        // createNewsArticle(citation.getHash(), citation.getURL());
+        // if (c++ % 100 == 0) {
+        // System.out.println("Done " + c);
+        // }
+        // }
     }
 }