/// <summary>
/// Crawls outward from a seed URL and stores the parsed text of every page that
/// could be loaded and parsed in the <c>AllPages</c> table.
/// </summary>
/// <param name="seed">
/// The start URL. It is declared as <see cref="Object"/> and cast to <see cref="string"/>.
/// </param>
/// <remarks>
/// At most the first 5000 entries of the link list are visited. The list grows while
/// crawling: links found on each stored page are merged in through <c>new_list</c>.
/// Pages for which <c>load</c> or <c>parseText</c> returns null are skipped.
/// </remarks>
public void crawler(Object seed)
{
    const int maxLinksToVisit = 5000;

    List<string> all_links = new List<string>();
    all_links.Add((string)seed); // seed

    // The context is disposed deterministically instead of being left to the finalizer.
    using (IRdbEntities db = new IRdbEntities())
    {
        for (int i = 0; i < maxLinksToVisit && i < all_links.Count; i++)
        {
            // Captured before the list is merged below, so the stored URL is always
            // the one that was actually loaded.
            string currentUrl = all_links[i];

            HtmlAgilityPack.HtmlDocument doc = load(currentUrl);
            if (doc == null)
            {
                continue;
            }

            string myBody = parseText(doc);
            if (myBody == null) // no body
            {
                continue;
            }

            List<string> links = getLinks(doc, currentUrl);
            all_links = new_list(all_links, links);

            AllPage page = new AllPage();
            page.linkUrl = currentUrl;
            page.mycontent = myBody;
            db.AllPages.Add(page);

            // Saved per page so that pages stored so far are kept even if a later
            // iteration throws.
            db.SaveChanges();
        }
    }
}
/// <summary>
/// Fills <c>DistinctkGramsIndexes</c> from <c>kGramIndexes</c>: for every k-gram row the
/// comma-separated <c>terms</c> value is split, de-duplicated, sorted with
/// <see cref="Array.Sort(Array)"/> and written back as a comma-separated string.
/// </summary>
void AllDistictItemskGram()
{
    // The context is disposed deterministically instead of being left to the finalizer.
    using (IRdbEntities db = new IRdbEntities())
    {
        List<kGramIndex> kGrams = db.kGramIndexes.ToList();

        foreach (var kgrm in kGrams)
        {
            string[] words = kgrm.terms.Split(',').Distinct().ToArray();
            Array.Sort(words);

            DistinctkGramsIndex k = new DistinctkGramsIndex();
            k.k_gram = kgrm.k_gram;
            k.terms = string.Join(",", words);
            db.DistinctkGramsIndexes.Add(k);
        }

        // One save for the whole batch instead of one database round trip per row.
        db.SaveChanges();
    }
}
/// <summary>
/// Fills <c>DistinctSoundexIndexes</c> from <c>SoundexIndexes</c>: for every soundex row the
/// comma-separated <c>items</c> value is split, de-duplicated, sorted with
/// <see cref="Array.Sort(Array)"/> and written back as a comma-separated string.
/// </summary>
void DistictItemsSoundex()
{
    // The context is disposed deterministically instead of being left to the finalizer.
    using (IRdbEntities db = new IRdbEntities())
    {
        List<SoundexIndex> sounds = db.SoundexIndexes.ToList();

        foreach (var sound in sounds)
        {
            string[] words = sound.items.Split(',').Distinct().ToArray();
            Array.Sort(words);

            DistinctSoundexIndex s = new DistinctSoundexIndex();
            s.soundex = sound.soundex;
            s.items = string.Join(",", words);
            db.DistinctSoundexIndexes.Add(s);
        }

        // One save for the whole batch instead of one database round trip per row.
        db.SaveChanges();
    }
}
/// <summary>
/// Writes the in-memory <c>SoundexIndex</c> collection to the <c>SoundexIndexes</c> table,
/// one row per key in ascending key order. Each row's <c>items</c> column holds the
/// key's values joined with commas.
/// </summary>
public void fillSoundexIndex()
{
    // The context is disposed deterministically instead of being left to the finalizer.
    using (IRdbEntities db = new IRdbEntities())
    {
        // Iterate the ordered sequence directly: copying it into a Dictionary first
        // would hand the ordering over to a type that does not guarantee it.
        foreach (var sound in SoundexIndex.OrderBy(entry => entry.Key))
        {
            SoundexIndex soun = new SoundexIndex();
            soun.soundex = sound.Key;
            soun.items = string.Join(",", sound.Value.ToArray());
            db.SoundexIndexes.Add(soun);
        }

        // One save for the whole batch instead of one database round trip per row.
        db.SaveChanges();
    }
}
/// <summary>
/// Writes the in-memory <c>BgramIndex</c> collection to the <c>kGramIndexes</c> table,
/// one row per key in ascending key order. Each row's <c>terms</c> column holds the
/// key's values joined with commas.
/// </summary>
public void fillBgramIndex()
{
    // The context is disposed deterministically instead of being left to the finalizer.
    using (IRdbEntities db = new IRdbEntities())
    {
        // Iterate the ordered sequence directly: copying it into a Dictionary first
        // would hand the ordering over to a type that does not guarantee it.
        foreach (var gram in BgramIndex.OrderBy(entry => entry.Key))
        {
            kGramIndex kG = new kGramIndex();
            kG.k_gram = gram.Key;
            kG.terms = string.Join(",", gram.Value.ToArray());
            db.kGramIndexes.Add(kG);
        }

        // One save for the whole batch instead of one database round trip per row.
        db.SaveChanges();
    }
}
/// <summary>
/// Sorts the in-memory <c>invertedIndex</c> by document id and then by token, stores the
/// sorted list back in the field, and writes one <c>InvertedIndexes</c> row per entry.
/// </summary>
/// <remarks>
/// Persisting is best-effort per row: an entry that cannot be built or saved is logged
/// through <see cref="System.Diagnostics.Trace"/> and skipped, and the remaining entries
/// are still attempted.
/// </remarks>
public void fillInvertedIndex()
{
    // The context is disposed deterministically instead of being left to the finalizer.
    using (IRdbEntities db = new IRdbEntities())
    {
        invertedIndex = invertedIndex.OrderBy(x => x.docID).ThenBy(x => x.mToken).ToList();

        foreach (var inv in invertedIndex)
        {
            InvertedIndex inver = null;
            bool tracked = false;

            try
            {
                string combindedpos = string.Join(",", inv.docPositions.ToArray());

                inver = new InvertedIndex();
                inver.docID = inv.docID;
                inver.frequency = inv.Freq;
                inver.term = inv.mToken;
                inver.positions = combindedpos;

                db.InvertedIndexes.Add(inver);
                tracked = true;
                db.SaveChanges();
            }
            catch (Exception e)
            {
                // Record the failure instead of swallowing it silently.
                System.Diagnostics.Trace.TraceError(
                    "fillInvertedIndex: skipped term '{0}' of document {1}: {2}",
                    inv.mToken, inv.docID, e);

                // A row that failed to save is still tracked as Added and would make
                // every later SaveChanges fail as well. Removing an Added entity
                // detaches it from the context, so the following rows can be saved.
                if (tracked)
                {
                    db.InvertedIndexes.Remove(inver);
                }
            }
        }
    }
}
/// <summary>
/// Runs the whole indexing pipeline: loads all crawled pages into <c>AllDocuments</c>,
/// calls <c>startIndexing</c>, writes the inverted, k-gram and soundex indexes on three
/// worker threads, and then builds the distinct soundex and k-gram tables.
/// </summary>
/// <remarks>
/// The distinct tables are built only after the k-gram and soundex writers have finished,
/// because they read the tables those writers fill. The inverted-index writer is awaited
/// last, so the method returns only when all three writers are done.
/// </remarks>
public void Indexing()
{
    // The context is kept open while startIndexing runs and disposed afterwards.
    // NOTE(review): assumes nothing lazy-loads from AllDocuments later on - confirm.
    using (IRdbEntities db = new IRdbEntities())
    {
        AllDocuments = db.AllPages.ToList();
        startIndexing();
    }

    Thread invertedWriter = new Thread(fillInvertedIndex);
    Thread kGramWriter = new Thread(fillBgramIndex);
    Thread soundexWriter = new Thread(fillSoundexIndex);

    // Each thread is started exactly once; starting a thread a second time throws
    // ThreadStateException.
    invertedWriter.Start();
    kGramWriter.Start();
    soundexWriter.Start();

    kGramWriter.Join();
    soundexWriter.Join();

    DistictItemsSoundex();
    AllDistictItemskGram();

    // Nothing above depends on the inverted index, so it is only awaited at the end.
    invertedWriter.Join();
}