Commit 342d0622 authored by Shyam Upadhyay

dkpro added

parent 26f435bb
@@ -24,6 +24,12 @@
   <dependencies>
+    <dependency>
+      <groupId>de.tudarmstadt.ukp.wikipedia</groupId>
+      <artifactId>de.tudarmstadt.ukp.wikipedia.revisionmachine</artifactId>
+      <version>1.0.0</version>
+    </dependency>
     <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
...
@@ -34,5 +34,6 @@ MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.IndexPageViews"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.PageViewIndexer"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.WikiClusters"
+MAIN="de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.gui.ConfigGUI"
+#MAIN="de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.DiffTool"
 time nice java $OPTIONS $MAIN $CONFIG_STR $*
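
The launcher now points MAIN at RevisionMachine's configuration GUI, with the headless DiffTool left commented out as the alternative. For illustration, here is a minimal Python sketch of what the `time nice java $OPTIONS $MAIN $CONFIG_STR $*` line ends up executing; the JVM options and the trailing arguments are placeholders, not values taken from this repository:

import subprocess
import sys

# Hypothetical stand-ins for $OPTIONS; the run script defines the real
# values elsewhere (not shown in this diff).
OPTIONS = ["-Xmx4g"]
MAIN = "de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.gui.ConfigGUI"

# Mirrors `time nice java $OPTIONS $MAIN $CONFIG_STR $*`; the config
# string and extra arguments are passed through from the command line.
subprocess.check_call(["nice", "java"] + OPTIONS + [MAIN] + sys.argv[1:])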
# Prints the download URLs for Wikipedia dumps; redirect stdout to
# wikiurls.txt and you can then fetch everything with:
#   wget --no-clobber -i wikiurls.txt
# The index of all wikis (all languages) is the page
# "https://dumps.wikimedia.org/backup-index.html" (saved locally as
# wiki.html); the loop below walks one dump directory.
import httplib
import httplib2
from urlparse import urlparse
from bs4 import BeautifulSoup, SoupStrainer

def checkUrl(url):
    "checks whether the url is dead by issuing a HEAD request"
    p = urlparse(url)
    conn = httplib.HTTPConnection(p.netloc)
    conn.request('HEAD', p.path)
    resp = conn.getresponse()
    return resp.status < 400

# httplib2 returns a (response headers, body) pair; parse only the <a> tags.
http = httplib2.Http()
resp, content = http.request('https://dumps.wikimedia.org/enwiki/20150805/')
for link in BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a')):
    if link.has_attr('href') and "pages-meta-history" in link["href"]:
        print "https://dumps.wikimedia.org/" + link["href"]