...@@ -24,6 +24,12 @@ ...@@ -24,6 +24,12 @@
<dependencies> <dependencies>
<dependency> <dependency>
<groupId>org.jsoup</groupId> <groupId>org.jsoup</groupId>
<artifactId>jsoup</artifactId> <artifactId>jsoup</artifactId>
...@@ -34,5 +34,6 @@ MAIN="$" ...@@ -34,5 +34,6 @@ MAIN="$"
#MAIN="$" #MAIN="$"
#MAIN="$" #MAIN="$"
#MAIN="$" #MAIN="$"
time nice java $OPTIONS $MAIN $CONFIG_STR $* time nice java $OPTIONS $MAIN $CONFIG_STR $*
import os
import sys
import argparse
import urllib
from bs4 import BeautifulSoup, SoupStrainer
import httplib
from urlparse import urlparse
# Creates wikiurls.txt, a file containing the download URLs for all Wikipedias (all languages).
# Once this script has generated wikiurls.txt, fetch the dumps with:
#   wget --no-clobber -i wikiurls.txt
# wiki.html is the html page ""
def checkUrl(url):
    """Return True if a HEAD request for ``url`` answers with a status below 400.

    The URL is treated as dead (False is returned) when it has no host part,
    when the connection or request fails, or when the server answers with a
    status of 400 or above.

    url -- absolute URL string; an ``https`` scheme is probed over TLS, any
           other scheme over plain HTTP.
    """
    p = urlparse(url)
    if not p.netloc:
        # Nothing to connect to, so the URL cannot be alive.
        return False

    # Use the connection class that matches the scheme instead of always
    # speaking plain HTTP.
    if p.scheme == 'https':
        conn_cls = httplib.HTTPSConnection
    else:
        conn_cls = httplib.HTTPConnection

    # urlparse splits params and query off the path; re-attach them so the
    # server is asked about the resource that was actually requested.
    target = p.path or '/'
    if p.params:
        target += ';' + p.params
    if p.query:
        target += '?' + p.query

    try:
        conn = conn_cls(p.netloc)
    except httplib.HTTPException:
        # Raised by the constructor for a malformed port in netloc.
        return False
    try:
        conn.request('HEAD', target)
        return conn.getresponse().status < 400
    except (IOError, httplib.HTTPException):
        # Socket-level failures (refused, DNS, TLS) derive from IOError.
        return False
    finally:
        # Always release the socket, whatever the outcome.
        conn.close()
import httplib2
from BeautifulSoup import BeautifulSoup, SoupStrainer
# Download one page and print the href of every anchor whose target mentions
# "pages-meta-history".
http = httplib2.Http()
# NOTE: httplib2's request() returns (response headers, body), so despite the
# names `status` is the header object and `response` is the page body.
status, response = http.request('')
# Parse only the <a> tags; nothing else on the page is needed.
for link in BeautifulSoup(response, parseOnlyThese=SoupStrainer('a')):
    # Anchors without an href (e.g. <a name="...">) would raise KeyError with
    # link["href"], so look the attribute up defensively and skip them.
    href = link.get("href")
    if href and "pages-meta-history" in href:
        # Single-argument print(...) behaves the same under the Python 2
        # print statement and the Python 3 function.
        print(""+href)
