Commit 342d0622 authored by Shyam Upadhyay

dkpro added

parent 26f435bb
@@ -24,6 +24,12 @@
   <dependencies>
+    <dependency>
+      <groupId>de.tudarmstadt.ukp.wikipedia</groupId>
+      <artifactId>de.tudarmstadt.ukp.wikipedia.revisionmachine</artifactId>
+      <version>1.0.0</version>
+    </dependency>
     <dependency>
       <groupId>org.jsoup</groupId>
       <artifactId>jsoup</artifactId>
...
@@ -34,5 +34,6 @@ MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.IndexPageViews"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.PageViewIndexer"
 #MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.WikiClusters"
+MAIN="de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.gui.ConfigGUI"
+#MAIN="de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.DiffTool"
 time nice java $OPTIONS $MAIN $CONFIG_STR $*
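
The launcher now points MAIN at RevisionMachine's configuration GUI, with the headless DiffTool left commented out as the alternative. For illustration, here is a minimal Python sketch of what the `time nice java $OPTIONS $MAIN $CONFIG_STR $*` line ends up executing; the JVM options and the trailing arguments are placeholders, not values taken from this repository:

import subprocess
import sys

# Hypothetical stand-ins for $OPTIONS; the run script defines the real
# values elsewhere (not shown in this diff).
OPTIONS = ["-Xmx4g"]
MAIN = "de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.gui.ConfigGUI"

# Mirrors `time nice java $OPTIONS $MAIN $CONFIG_STR $*`; the config
# string and extra arguments are passed through from the command line.
subprocess.check_call(["nice", "java"] + OPTIONS + [MAIN] + sys.argv[1:])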
# Prints the download URLs for Wikipedia dumps; redirect stdout to
# wikiurls.txt and you can then fetch everything with:
#   wget --no-clobber -i wikiurls.txt
# The index of all wikis (all languages) is the page
# "https://dumps.wikimedia.org/backup-index.html" (saved locally as
# wiki.html); the loop below walks one dump directory.
import httplib
import httplib2
from urlparse import urlparse
from bs4 import BeautifulSoup, SoupStrainer

def checkUrl(url):
    "checks whether the url is dead by issuing a HEAD request"
    p = urlparse(url)
    conn = httplib.HTTPConnection(p.netloc)
    conn.request('HEAD', p.path)
    resp = conn.getresponse()
    return resp.status < 400

# httplib2 returns a (response headers, body) pair; parse only the <a> tags.
http = httplib2.Http()
resp, content = http.request('https://dumps.wikimedia.org/enwiki/20150805/')
for link in BeautifulSoup(content, "html.parser", parse_only=SoupStrainer('a')):
    if link.has_attr('href') and "pages-meta-history" in link["href"]:
        print "https://dumps.wikimedia.org/" + link["href"]