Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Shyam Upadhyay
event-wikifier
Commits
26f435bb
Commit
26f435bb
authored
Sep 04, 2015
by
Shyam Upadhyay
Browse files
WIP
parent
f9b0acb6
Changes
4
Hide whitespace changes
Inline
Side-by-side
compute_counts.py
View file @
26f435bb
...
...
@@ -2,25 +2,61 @@
import
csv
import
sys
from
collections
import
Counter
from
urlparse
import
urlparse
# --- input & global tallies -------------------------------------------------
# The TSV file to analyze is the first command-line argument.
f = open(sys.argv[1])
data = csv.reader(f, delimiter='\t')

# Frequency tables filled in while streaming the rows below.
cnt = Counter()          # host substrings parsed out of "URL:<value>" cells
url_cnt = Counter()      # normalized domains (see processURL)
cat_cnt = Counter()      # individual category names
url_cat_cnt = Counter()  # "<domain>#<category>" pairs
page_cnt = Counter()     # page titles
bad = 0  # rows mangled by concurrent/parallel writers; should stay small
def processURL(url):
    """Reduce a full URL to its '<scheme>://<host>/' domain form."""
    parts = urlparse(url)
    # if "///" in domain:
    #     print url
    return '{0.scheme}://{0.netloc}/'.format(parts)
def isValid(raw_url, page, outlinks, cats):
    """A record is usable only when it has a real URL and at least 100 outlinks.

    page and cats are accepted for signature compatibility but are not
    consulted by the current filter.
    """
    return raw_url != "null" and outlinks >= 100
for
row
in
data
:
# print row
for
i
in
row
:
idx
=
i
.
find
(
':'
)
name
=
i
[
0
:
idx
]
val
=
i
[
idx
+
1
:]
if
name
==
"URL"
:
if
val
!=
"null"
:
print
val
val
=
val
.
strip
(
"http://"
)
jj
=
val
.
find
(
'/'
)
# print val[0:jj]
cnt
.
update
((
val
[
0
:
jj
],))
else
:
print
"NULL"
for
i
in
cnt
.
most_common
(
int
(
sys
.
argv
[
2
])):
print
i
if
len
(
row
)
==
4
:
raw_url
=
row
[
0
]
page
=
row
[
1
]
outlinks
=
int
(
row
[
2
])
cats
=
row
[
3
].
strip
(
'['
).
strip
(
']'
).
split
(
','
)
cats
=
map
(
lambda
x
:
x
.
strip
(),
cats
)
if
not
isValid
(
raw_url
,
page
,
outlinks
,
cats
):
continue
url
=
processURL
(
raw_url
)
page_cnt
.
update
((
page
,))
# print cats
cat_cnt
.
update
(
cats
)
url_cnt
.
update
((
url
,))
for
cat
in
cats
:
url_cat_cnt
.
update
((
url
+
"#"
+
cat
,))
else
:
# print "BAD",row
bad
+=
1
print
"botched due to parallel"
,
bad
# for i in url_cnt.most_common(int(sys.argv[2])):
# print i
# for i in cat_cnt.most_common(int(sys.argv[2])):
# print i
for
i
in
page_cnt
.
most_common
(
10
):
print
i
,
page_cnt
[
i
]
# for i in url_cat_cnt:
# print i,url_cat_cnt[i]
run.sh
View file @
26f435bb
...
...
@@ -24,7 +24,7 @@ MAIN="$PACKAGE_PREFIX.wiki.events.CiteMiner"
#MAIN="$PACKAGE_PREFIX.wiki.events.TrainingDataGenerator"
#MAIN="$PACKAGE_PREFIX.wiki.events.baseline.Baseline"
# Select the entry point class; with repeated assignments the LAST
# uncommented MAIN wins, so ReadNothmanAnnotation is the active one.
MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.NewsArticleMiner"
MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
# MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.ReadNothmanAnnotation"
#MAIN="$PACKAGE_PREFIX.wiki.events.newsarticle.CleanData"
#MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.SimpleWikipageIndexer"
#MAIN="$PACKAGE_PREFIX.wiki.parsing.indexing.simplewiki.IndexSimpleWikipedia"
...
...
src/main/java/edu/illinois/cs/cogcomp/wiki/events/newsarticle/ArticleWorker.java
View file @
26f435bb
...
...
@@ -10,6 +10,7 @@ import edu.illinois.cs.cogcomp.wiki.events.indexing.CitationIndexer;
public
class
ArticleWorker
implements
Runnable
{
// Work items (one citation line each) assigned to this worker.
private List<String> jobs;
// Base directory under which fetched articles are dumped.
private String dumpath;
@Override
public
void
run
()
{
...
...
@@ -22,7 +23,7 @@ public class ArticleWorker implements Runnable {
+
line
;
trial
++;
try
{
String
filepath
=
"nothman-data/newsbusiness"
+
"/"
+
md5
;
String
filepath
=
dumpath
+
"/"
+
md5
;
if
(
NewsArticleMiner
.
dumpNewsArticle
(
filepath
,
urlstr
))
{
hit
++;
}
else
{
...
...
@@ -40,7 +41,8 @@ public class ArticleWorker implements Runnable {
}
}
/**
 * Creates a worker over a slice of the job list.
 *
 * @param jobs    citation lines this worker will process
 * @param dumpath base directory the fetched articles are dumped under
 */
public ArticleWorker(List<String> jobs, String dumpath) {
	this.dumpath = dumpath;
	this.jobs = jobs;
}
}
src/main/java/edu/illinois/cs/cogcomp/wiki/events/newsarticle/NewsArticleMiner.java
View file @
26f435bb
...
...
@@ -44,9 +44,11 @@ import edu.illinois.cs.cogcomp.wikifier.utils.datastructure.StringCounter;
public
class
NewsArticleMiner
{
// Path to the file listing the article ids/URLs to mine.
private String idfilepath;
// Base directory fetched articles are written to.
private String dumpath;
/**
 * @param idfilepath file listing the article ids/URLs to mine
 * @param dumpath    base directory fetched articles are written to
 */
public NewsArticleMiner(String idfilepath, String dumpath) {
	this.dumpath = dumpath;
	this.idfilepath = idfilepath;
}
public
static
String
fetchArticleFromWeb
(
String
urlstr
)
...
...
@@ -62,44 +64,47 @@ public class NewsArticleMiner {
}
}
public
static
String
fetchHTMLArticleFromWeb
(
String
urlstr
)
throws
MalformedURLException
{
URL
url
;
InputStream
is
=
null
;
BufferedReader
br
;
String
line
;
String
content
=
""
;
InputStream
is
=
null
;
BufferedReader
br
;
String
line
;
String
content
=
""
;
try
{
url
=
new
URL
(
urlstr
);
is
=
url
.
openStream
();
// throws an IOException
br
=
new
BufferedReader
(
new
InputStreamReader
(
is
));
while
((
line
=
br
.
readLine
())
!=
null
)
{
System
.
out
.
println
(
line
);
content
+=
line
+
"\n"
;
}
}
catch
(
MalformedURLException
mue
)
{
mue
.
printStackTrace
();
}
catch
(
IOException
ioe
)
{
ioe
.
printStackTrace
();
}
finally
{
try
{
if
(
is
!=
null
)
is
.
close
();
}
catch
(
IOException
ioe
)
{
// nothing to see here
}
}
url
=
new
URL
(
urlstr
);
is
=
url
.
openStream
();
// throws an IOException
br
=
new
BufferedReader
(
new
InputStreamReader
(
is
));
while
((
line
=
br
.
readLine
())
!=
null
)
{
System
.
out
.
println
(
line
);
content
+=
line
+
"\n"
;
}
}
catch
(
MalformedURLException
mue
)
{
mue
.
printStackTrace
();
}
catch
(
IOException
ioe
)
{
ioe
.
printStackTrace
();
}
finally
{
try
{
if
(
is
!=
null
)
is
.
close
();
}
catch
(
IOException
ioe
)
{
// nothing to see here
}
}
return
content
;
}
static
int
empty
=
0
;
public
static
boolean
dumpNewsArticle
(
String
filepath
,
String
url
)
throws
FileNotFoundException
,
MalformedURLException
,
BoilerpipeProcessingException
{
//
String filepath = Experiments.datapath + "/" + md5;
//
String filepath = Experiments.datapath + "/" + md5;
if
(!
IOUtils
.
exists
(
filepath
))
{
//
String content = fetchArticleFromWeb(url);
String
content
=
fetchHTMLArticleFromWeb
(
url
);
String
content
=
fetchArticleFromWeb
(
url
);
//
String content = fetchHTMLArticleFromWeb(url);
if
(
content
!=
null
&&
content
.
trim
().
length
()
>=
20
)
{
PrintWriter
w
=
new
PrintWriter
(
new
File
(
filepath
));
w
.
println
(
content
);
...
...
@@ -133,7 +138,7 @@ public class NewsArticleMiner {
List
<
String
>
subList
=
lines
.
subList
(
i
*
(
lines
.
size
()
/
NUM_THREADS
),
((
i
+
1
)
*
(
lines
.
size
()
/
NUM_THREADS
)
-
1
));
ArticleWorker
articleWorker
=
new
ArticleWorker
(
subList
);
ArticleWorker
articleWorker
=
new
ArticleWorker
(
subList
,
dumpath
);
executor
.
submit
(
articleWorker
);
}
executor
.
shutdown
();
...
...
@@ -167,43 +172,45 @@ public class NewsArticleMiner {
System
.
out
.
println
(
counter
.
getTopK
(
10
));
}
public
static
void
getNothmanData
(
String
idsFile
)
throws
FileNotFoundException
,
MalformedURLException
{
public
static
void
getNothmanData
(
String
idsFile
)
throws
FileNotFoundException
,
MalformedURLException
{
List
<
String
>
urllist
=
LineIO
.
read
(
idsFile
);
String
title
;
for
(
int
i
=
0
;
i
<
10
;
i
++)
{
for
(
int
i
=
0
;
i
<
10
;
i
++)
{
System
.
out
.
println
(
i
);
title
=
urllist
.
get
(
i
);
PrintWriter
w
=
new
PrintWriter
(
"nothman-data/"
+
title
);
w
.
println
(
NewsArticleMiner
.
fetchArticleFromWeb
(
"http://newsstore.smh.com.au/apps/viewDocument.ac?docID="
+
title
));
title
=
urllist
.
get
(
i
);
PrintWriter
w
=
new
PrintWriter
(
"nothman-data/"
+
title
);
w
.
println
(
NewsArticleMiner
.
fetchArticleFromWeb
(
"http://newsstore.smh.com.au/apps/viewDocument.ac?docID="
+
title
));
w
.
close
();
}
}
/**
 * CLI entry point: mines news articles listed in an id file and dumps
 * them under a target directory.
 *
 * @param args args[0] = id file path, args[1] = dump directory
 */
public static void main(String[] args)
		throws BoilerpipeProcessingException, IOException {
	// ROBUSTNESS FIX: fail with a usage message instead of an
	// ArrayIndexOutOfBoundsException when arguments are missing.
	if (args.length < 2) {
		System.err.println("usage: NewsArticleMiner <idfilepath> <dumpath>");
		return;
	}
	NewsArticleMiner miner = new NewsArticleMiner(args[0], args[1]);
	miner.run();
	// Alternative one-off entry points kept for reference:
	// dumpNewsArticle("nothman-data/test.html",
	//     "http://newsstore.smh.com.au/apps/viewDocument.ac?docID=SMH020520EV3D919FILN");
	// getNothmanData(args[0]);
}
}
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment