Commit 342d0622 authored by Shyam Upadhyay

dkpro added

parent 26f435bb
@@ -24,6 +24,12 @@
<dependencies>
  <dependency>
    <groupId>de.tudarmstadt.ukp.wikipedia</groupId>
    <artifactId>de.tudarmstadt.ukp.wikipedia.revisionmachine</artifactId>
    <version>1.0.0</version>
  </dependency>
  <dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
@@ -34,5 +34,6 @@ MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
#MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.IndexPageViews"
#MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.PageViewIndexer"
#MAIN="$PACKAGE_PREFIX.wiki.parsing.pageviews.WikiClusters"
MAIN="de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.config.gui.ConfigGUI"
#MAIN="de.tudarmstadt.ukp.wikipedia.revisionmachine.difftool.DiffTool"
time nice java $OPTIONS $MAIN $CONFIG_STR $*
import os
import sys
import argparse
import urllib
import httplib
import httplib2
from bs4 import BeautifulSoup, SoupStrainer
from urlparse import urlparse
# Builds wikiurls.txt, a file listing the download URLs for all wikipedias
# (all languages): redirect this script's output into wikiurls.txt, then
# fetch everything with
#   wget --no-clobber -i wikiurls.txt
# wiki.html is the saved html page "https://dumps.wikimedia.org/backup-index.html"
def checkUrl(url):
    """Checks whether the url is dead by issuing a HEAD request."""
    p = urlparse(url)
    conn = httplib.HTTPConnection(p.netloc)
    conn.request('HEAD', p.path)
    resp = conn.getresponse()
    return resp.status < 400
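# Usage sketch for checkUrl (an assumed example, not in the original script;
# the dump-directory URL is the one fetched below). Expected to return True
# while that dump directory is still online:
#
# >>> checkUrl("https://dumps.wikimedia.org/enwiki/20150805/")
# True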
# Scrape the dump directory page and print the download URL of every
# full-revision-history ("pages-meta-history") part it links to.
http = httplib2.Http()
response, content = http.request('https://dumps.wikimedia.org/enwiki/20150805/')
soup = BeautifulSoup(content, 'html.parser', parse_only=SoupStrainer('a'))
for link in soup.find_all('a', href=True):
    if "pages-meta-history" in link["href"]:
        print "https://dumps.wikimedia.org/" + link["href"]