/// <summary>
/// Creates a spider that works against the given frontier and filter.
/// </summary>
/// <param name="frontier">Frontier stored for later use by this spider.</param>
/// <param name="index">
/// Index used as the template for this spider's own index; the spider keeps the result of
/// <c>Index.CreateEmptyCopy(index)</c>, not <paramref name="index"/> itself.
/// </param>
/// <param name="filter">Filter stored for later use by this spider.</param>
/// <param name="callback">Delegate taking an <c>Index</c>, stored for later use by this spider.</param>
public Spider(Frontier frontier, Index index, Filtering.Filter filter, Action<Index> callback)
{
    // The spider never holds on to the caller's index directly, only to a copy made from it.
    Index privateIndex = Index.CreateEmptyCopy(index);

    this.callback = callback;
    this.filter = filter;
    this.frontier = frontier;
    this.index = privateIndex;
}
/// <summary>
/// Merges the documents and stem entries of <paramref name="index"/> into this index.
/// </summary>
/// <remarks>
/// A document of <paramref name="index"/> is dropped when its similarity to any document already in
/// <c>sites</c> is 0.9 or higher; the remaining documents are appended to <c>sites</c>.
/// Stems that are missing here are added with the incoming value as-is; stems present on both sides
/// are combined through <c>MergeInto</c>.
/// </remarks>
/// <param name="index">The index whose content is merged into this one.</param>
public void MergeIn(Index index)
{
    // Work on a copy so the incoming index's own document list is not modified.
    List<Document> accepted = new List<Document>(index.sites);

    // Drop every incoming document that is a near-duplicate of one we already hold.
    // Candidates are visited front to back, and each is compared against sites in order
    // until the first hit.
    accepted.RemoveAll(candidate =>
    {
        foreach (var existing in sites)
        {
            double score = similarity.CalculateSimilarity(existing, candidate);
            if (score >= 0.9)
            {
                return true;
            }
        }

        return false;
    });

    foreach (var document in accepted)
    {
        sites.Add(document);
    }

    foreach (var term in index.stems.Keys)
    {
        var incoming = index.stems[term];

        if (stems.ContainsKey(term))
        {
            // Entries are ordered by document id; the predicate is true for entries whose
            // document was NOT accepted above.
            // NOTE(review): how MergeInto uses that predicate is not visible here - confirm.
            stems[term].MergeInto(
                incoming,
                (left, right) => left.Document.Id.CompareTo(right.Document.Id),
                entry => !accepted.Contains(entry.Document));
        }
        else
        {
            stems.Add(term, incoming);
        }
    }
}
/// <summary>
/// Builds a ranker for <paramref name="index"/>: fills <c>tf_idf</c> with one entry per stem of the
/// index and replaces every value in <c>lengths</c> with its square root.
/// </summary>
/// <param name="index">Index whose stems and site count are read; stored on the ranker.</param>
/// <param name="stemmer">Stemmer stored on the ranker.</param>
public Ranker(Index index, TermStemmer stemmer)
{
    int totalDocuments = index.SiteCount;

    foreach (var stem in index.GetStems())
    {
        var weights = getTF_IDF(stem.Value, totalDocuments);
        tf_idf.Add(stem.Key, weights);
    }

    // Copy the keys first so the entries can be overwritten while looping.
    Document[] documents = lengths.Keys.ToArray();
    for (int i = 0; i < documents.Length; i++)
    {
        Document document = documents[i];
        lengths[document] = Math.Sqrt(lengths[document]);
    }

    // NOTE(review): the fields are assigned only after the weights were computed -
    // confirm getTF_IDF does not read them.
    this.index = index;
    this.stemmer = stemmer;

    // Disabled in the original source:
    //this.TF_WT = getTF_WT();
    //this.IDF_WT = getIDF_WT();
    //this.TF_IDF_WT = getTF_IDF_WT();
    //this.NORM_WT = getNORM_WT();
}
/// <summary>
/// Starts one spider thread per <c>SPIDER_PAGE_COUNT</c> pages (rounded up) and blocks until every
/// thread has finished.
/// </summary>
/// <param name="frontier">Frontier handed to every spider.</param>
/// <param name="index">
/// Index handed to every spider and the target of the merge performed by the spiders' callback.
/// It is also the object the merge is locked on.
/// </param>
/// <param name="filter">Filter handed to every spider.</param>
/// <param name="pagecount">
/// Number of pages used to size the spider pool. Values that yield no spiders (zero or negative)
/// make the method return immediately.
/// </param>
public static void StartAndWait(Frontier frontier, Index index, Filtering.Filter filter, int pagecount)
{
    int count = (int)Math.Ceiling(pagecount / (double)SPIDER_PAGE_COUNT);
    if (count <= 0)
    {
        // Nothing to crawl; also avoids allocating an array with a negative length.
        return;
    }

    // One delegate shared by all spiders. The console colour is process-wide state, so the
    // colour change and the message are kept inside the same lock as the merge; otherwise
    // concurrent callbacks could interleave and print in the wrong colour.
    // NOTE(review): presumably invoked on the spider threads - the lock target is kept as
    // `index` in case other code synchronises on it as well.
    Action<Index> mergeResult = partial =>
    {
        lock (index)
        {
            Console.ForegroundColor = ConsoleColor.Cyan;
            Console.WriteLine("Merging Index of {0}", partial.SiteCount);
            Console.ForegroundColor = ConsoleColor.Gray;

            index.MergeIn(partial);
        }
    };

    Thread[] threads = new Thread[count];
    for (int i = 0; i < count; i++)
    {
        // Declared inside the loop so each thread's lambda captures its own spider.
        Spider spider = new Spider(frontier, index, filter, mergeResult);
        threads[i] = new Thread(() => spider.Run());
        threads[i].Start();
    }

    foreach (Thread thread in threads)
    {
        thread.Join();
    }
}
/// <summary>
/// Creates a new <c>Index</c> constructed from only the stemmer and similarity of
/// <paramref name="copyFrom"/>.
/// </summary>
/// <param name="copyFrom">Index whose stemmer and similarity are passed to the new instance.</param>
/// <returns>The newly constructed index.</returns>
public static Index CreateEmptyCopy(Index copyFrom)
{
    Index emptyCopy = new Index(copyFrom.stemmer, copyFrom.similarity);
    return emptyCopy;
}
/// <summary>
/// Builds a new <c>Index</c> that is given the same stemmer and similarity as
/// <paramref name="copyFrom"/> and nothing else from it.
/// </summary>
/// <param name="copyFrom">Source of the stemmer and similarity for the new index.</param>
/// <returns>The newly constructed index.</returns>
public static Index CreateEmptyCopy(Index copyFrom)
{
    var sharedStemmer = copyFrom.stemmer;
    var sharedSimilarity = copyFrom.similarity;

    return new Index(sharedStemmer, sharedSimilarity);
}