/// <summary>
/// Records a regular page in <c>conceptdata</c> (title, outlinks, categories, redirects)
/// and then passes the page on to <c>processText</c>.
/// </summary>
/// <remarks>
/// The entry for the page is created if absent, marked valid and given the trimmed title
/// before any filtering. If the page has fewer than <paramref name="links_threshold"/> wiki
/// constructs, or fewer than <paramref name="minimum_length"/> good words, the entry is
/// marked invalid and the method returns without recording anything else.
/// </remarks>
/// <param name="res">Decoded page to process.</param>
/// <param name="threadid">Forwarded unchanged to <c>processText</c>.</param>
/// <param name="links_threshold">Minimum value of <c>res.NumberWikiConstructs()</c> for the page to be kept.</param>
/// <param name="minimum_length">Minimum value of <c>res.NumberGoodWords()</c> for the page to be kept.</param>
private void processregular(DecodedTextClass res, int threadid, int links_threshold = 5, int minimum_length = 100)
{
    string title = res.title.Trim();
    int id = res.identifier;

    // Make sure an entry exists, then fetch it once instead of re-indexing the dictionary on every access.
    conceptdata.TryAdd(id, new conceptstats());
    var stats = conceptdata[id];
    stats.valid = true;
    stats.title = title;

    // Invalidate pages with too few constructs or too little text.
    // The word count is only evaluated when the construct count passes (short-circuit).
    int constructs = res.NumberWikiConstructs();
    if (constructs < links_threshold || res.NumberGoodWords() < minimum_length)
    {
        stats.valid = false;
        return;
    }

    // Allocated only after the guards so rejected pages do not pay for them.
    HashSet<string> categories = new HashSet<string>();
    HashSet<string> redirects = new HashSet<string>();

    char[] chararray = new char[0];
    int startindex = 0;
    int length = 0;
    int type = 0;

    for (int i = 0; i < constructs; i++)
    {
        if (!res.GetWikiConstruct(i, ref type, ref chararray, ref startindex, ref length))
        {
            continue;
        }

        switch (type)
        {
            case 0:
                // Type 0: stored (trimmed) as an outlink of this page.
                string link = new string(chararray, startindex, length).Trim();
                // The collection type of outlinks is declared elsewhere, so the explicit
                // duplicate check is kept rather than relying on set semantics.
                if (!stats.outlinks.Contains(link))
                {
                    stats.outlinks.Add(link);
                }
                break;

            case 1:
                // Type 1: stored as a category after cleaning. HashSet.Add ignores duplicates.
                categories.Add(cleancategorytitle(new string(chararray, startindex, length), true));
                break;

            case 2:
                // Type 2: stored verbatim as a redirect. HashSet.Add ignores duplicates.
                redirects.Add(new string(chararray, startindex, length));
                break;

            default:
                break;
        }
    }

    stats.categories = categories;
    stats.redirects = redirects;

    // Now process the words of the page.
    processText(res, threadid);
}
/// <summary>
/// Registers a category page: maps its cleaned title to the page identifier in
/// <c>categorydict</c>, and maps the same title to the set of cleaned category names
/// found among the page's type-1 wiki constructs in <c>parentcategorydict</c>.
/// </summary>
/// <param name="res">Decoded category page.</param>
private void processcategory(DecodedTextClass res)
{
    // The raw title still includes the "category:" prefix, hence the 'false' flag here
    // versus 'true' for the construct text below.
    string cleanedTitle = cleancategorytitle(res.title, false);
    categorydict.TryAdd(cleanedTitle, res.identifier);

    char[] buffer = new char[0];
    int offset = 0;
    int count = 0;
    int kind = 0;
    HashSet<string> parents = new HashSet<string>();

    int total = res.NumberWikiConstructs();
    for (int index = 0; index < total; index++)
    {
        bool found = res.GetWikiConstruct(index, ref kind, ref buffer, ref offset, ref count);

        // Only type-1 (category) constructs are of interest; the set discards duplicates.
        if (found && kind == 1)
        {
            string parent = cleancategorytitle(new string(buffer, offset, count), true);
            parents.Add(parent);
        }
    }

    parentcategorydict.TryAdd(cleanedTitle, parents);
}
/// <summary>
/// Collects the distinct, trimmed text of every type-0 wiki construct on the page and
/// stores that set in <c>disambigredirect</c> under the page's (untrimmed) title.
/// </summary>
/// <param name="res">Decoded disambiguation page.</param>
private void processdisambig(DecodedTextClass res)
{
    HashSet<string> targets = new HashSet<string>();
    char[] buffer = new char[0];
    int offset = 0;
    int count = 0;
    int kind = 0;

    int total = res.NumberWikiConstructs();
    for (int index = 0; index < total; index++)
    {
        bool found = res.GetWikiConstruct(index, ref kind, ref buffer, ref offset, ref count);
        if (!found || kind != 0)
        {
            continue;
        }

        // The set silently ignores text that has already been recorded.
        string target = new string(buffer, offset, count).Trim();
        targets.Add(target);
    }

    disambigredirect.TryAdd(res.title, targets);
}
/// <summary>
/// Reports whether the page carries the category "Disambiguation pages".
/// </summary>
/// <param name="res">Decoded page to inspect.</param>
/// <returns>
/// <c>true</c> as soon as a type-1 (category) wiki construct cleans to exactly
/// "Disambiguation pages"; <c>false</c> if no construct matches.
/// </returns>
private bool isdisambig(DecodedTextClass res)
{
    char[] chararray = new char[0];
    int startindex = 0;
    int length = 0;
    int type = 0;

    int n = res.NumberWikiConstructs();
    for (int i = 0; i < n; i++)
    {
        if (!res.GetWikiConstruct(i, ref type, ref chararray, ref startindex, ref length))
        {
            continue;
        }

        // Only categories (type 1) are examined.
        if (type != 1)
        {
            continue;
        }

        // Cleaned with the same flag used for construct text elsewhere in this file;
        // per the original note, the processor has already removed the "Category:" prefix.
        string cat = cleancategorytitle(new string(chararray, startindex, length), true);
        if (cat == "Disambiguation pages")
        {
            return true;
        }
    }

    return false;
}