Commit 2ce3f0c7 authored by Shyam Upadhyay's avatar Shyam Upadhyay
Browse files

save

parent 4bf95819
......@@ -26,15 +26,15 @@ import edu.illinois.cs.cogcomp.wikifier.utils.lucene.Lucene;
public class IndexWikiLinkStructure {
// public static String indexPath = "/shared/bronte/upadhya3/data/WikiLinkStructureIndex";
public static String indexPath = "testIndex";
public static String indexPath = "/shared/bronte/upadhya3/data/WikiLinkStructureIndex";
// public static String indexPath = "testIndex";
public static void index() throws IOException {
String dumpPath;
// dumpPath = "/shared/bronte/cheng88/wikidump/enwiki-latest-pages-articles2014Jan.xml.bz2";
dumpPath = "/shared/bronte/cheng88/wikidump/enwiki-latest-pages-articles2014Jan.xml.bz2";
// for test driving purposes
dumpPath ="enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2";
// dumpPath ="enwiki-latest-pages-articles1.xml-p000000010p000010000.bz2";
String bz2Filename = dumpPath;
try {
......
......@@ -32,8 +32,11 @@ import edu.illinois.cs.cogcomp.core.utilities.commands.InteractiveShell;
import edu.illinois.cs.cogcomp.wikifier.utils.lucene.Lucene;
/**
* @author cheng88
* @modified upadhya3
* Indexes the edge structure of the Wikipedia graph. Every edge is stored as a
* doc with 3 fields: the src page, the target page, and the surface form of the link.
* The links in a page are available through the wikifier view.
*
* @author upadhya3
*
*/
......@@ -62,7 +65,8 @@ public class WikiLinkStructureIndexer {
}
public WikiLinkStructureIndexer(String path, boolean readOnly) {
logger.warn("Init wiki link structure indexer in read mode from ..." + path);
logger.warn("Init wiki link structure indexer in read mode from ..."
+ path);
try {
IndexReader reader = Lucene.reader(path);
searcher = new IndexSearcher(reader);
......@@ -88,7 +92,7 @@ public class WikiLinkStructureIndexer {
List<Document> results = Lists.newArrayList();
try {
Query q = new TermQuery(new Term("target", target_title));
// System.out.println(q.toString());
// System.out.println(q.toString());
for (ScoreDoc sc : searcher.search(q, 10000).scoreDocs) {
results.add(searcher.doc(sc.doc));
}
......@@ -113,36 +117,35 @@ public class WikiLinkStructureIndexer {
}
}
public static void indexData() throws IOException
{
public static void indexData() throws IOException {
IndexWikiLinkStructure.index();
}
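/**
* Manual smoke test: opens the index at IndexWikiLinkStructure.indexPath,
* looks up the links whose target is "World_War_II", and prints each hit's
* target followed by the number of results.
*
* @throws Exception
*/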
public static void testIndex() throws Exception {
WikiLinkStructureIndexer indexer = new WikiLinkStructureIndexer(IndexWikiLinkStructure.indexPath);
// List<Document> docs = indexer.searchTitleInLinks("United_States"); // 589 on small
WikiLinkStructureIndexer indexer = new WikiLinkStructureIndexer(
IndexWikiLinkStructure.indexPath);
// List<Document> docs = indexer.searchTitleInLinks("United_States"); //
// 589 on small
List<Document> docs = indexer.searchTitleInLinks("World_War_II"); // 469
// IndexReader reader = indexer.searcher.getIndexReader();
for(Document doc:docs)
{
// System.out.println("SRC "+doc.get("src"));
System.out.println("TARGET "+doc.get("target"));
// System.out.println("EDGE "+doc.get("edgeLabel"));
// IndexReader reader = indexer.searcher.getIndexReader();
for (Document doc : docs) {
// System.out.println("SRC "+doc.get("src"));
System.out.println("TARGET " + doc.get("target"));
// System.out.println("EDGE "+doc.get("edgeLabel"));
}
System.out.println("ANS SIZE: "+docs.size());
// for(int i=0;i<indexer.numDocs();i++)
// {
// Document doc = reader.document(i);
// System.out.println("SRC "+doc.get("src"));
// System.out.println("TARGET "+doc.get("target"));
// System.out.println("EDGE "+doc.get("edgeLabel"));
// }
System.out.println("ANS SIZE: " + docs.size());
// for(int i=0;i<indexer.numDocs();i++)
// {
// Document doc = reader.document(i);
// System.out.println("SRC "+doc.get("src"));
// System.out.println("TARGET "+doc.get("target"));
// System.out.println("EDGE "+doc.get("edgeLabel"));
// }
}
}
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment