/// <summary>
/// Sets up the context: assigns a new string tree constructed with "chris" to <c>tree</c>,
/// then adds "damon", "mason" and "clara" to it, in that order.
/// </summary>
public override void EstablishContext()
{
    tree = new UrlTree<string>("chris");

    // Insertion order is kept identical to the original sequence of Add calls.
    string[] addedValues = new string[] { "damon", "mason", "clara" };
    foreach (string value in addedValues)
    {
        tree.Add(value);
    }
}
/// <summary>
/// Inserts a document's hash codes into the URL tree held in <paramref name="textBlockInfo"/>
/// and appends a matching history entry to the queue held alongside it.
/// </summary>
/// <param name="textBlockInfo">Pair whose first item is the URL tree and whose second item is the history queue.</param>
/// <param name="responseUrl">URL passed to both the history entry and the tree insert.</param>
/// <param name="hashCodes">Hash codes passed to both the history entry and the tree insert.</param>
/// <param name="fullPath">Forwarded unchanged to the history entry and to the tree insert.</param>
/// <param name="corpusId">Not used by this method.</param>
/// <param name="documentId">Not used by this method.</param>
/// <param name="domainName">Not used by this method.</param>
/// <param name="time">Timestamp stored in the history entry.</param>
/// <param name="incDocCount">Forwarded to the tree insert; also passed as the history entry's decDocCount argument.</param>
private void AddToUrlTree(Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo, string responseUrl, ArrayList<ulong> hashCodes, bool fullPath, string corpusId, string documentId, string domainName, DateTime time, bool incDocCount)
{
    UrlTree tree = textBlockInfo.First;
    Queue<TextBlockHistoryEntry> history = textBlockInfo.Second;

    // The entry is built before the insert, and queued only after the insert has returned.
    TextBlockHistoryEntry entry = new TextBlockHistoryEntry(
        responseUrl,
        hashCodes,
        fullPath,
        time,
        /*decDocCount=*/ incDocCount);

    tree.Insert(
        responseUrl,
        hashCodes,
        mMinNodeDocCount,
        fullPath,
        /*insertUnique=*/ true,
        incDocCount);

    history.Enqueue(entry);
}
/// <summary>
/// Inserts a document's hash codes into the URL tree held in <paramref name="textBlockInfo"/>,
/// appends a matching history entry to the queue held alongside it and, when a database
/// connection is set, stores the entry's hash codes (Base64-encoded) in the TextBlocks table.
/// </summary>
/// <param name="textBlockInfo">Pair whose first item is the URL tree and whose second item is the history queue.</param>
/// <param name="responseUrl">URL passed to both the history entry and the tree insert.</param>
/// <param name="hashCodes">Hash codes passed to both the history entry and the tree insert.</param>
/// <param name="fullPath">Forwarded unchanged to the history entry and to the tree insert.</param>
/// <param name="corpusId">Value written to the corpusId column.</param>
/// <param name="documentId">Value written to the docId column.</param>
/// <param name="domainName">Not used by this method.</param>
/// <param name="time">Timestamp stored in the history entry.</param>
/// <param name="incDocCount">Forwarded to the tree insert; also passed as the history entry's decDocCount argument.</param>
private void AddToUrlTree(Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo, string responseUrl, ArrayList<ulong> hashCodes, bool fullPath, string corpusId, string documentId, string domainName, DateTime time, bool incDocCount)
{
    UrlTree tree = textBlockInfo.First;
    Queue<TextBlockHistoryEntry> history = textBlockInfo.Second;

    // The entry is built before the insert, and queued only after the insert has returned.
    TextBlockHistoryEntry entry = new TextBlockHistoryEntry(
        responseUrl,
        hashCodes,
        fullPath,
        time,
        /*decDocCount=*/ incDocCount);

    tree.Insert(
        responseUrl,
        hashCodes,
        mMinNodeDocCount,
        fullPath,
        /*insertUnique=*/ true,
        incDocCount);

    history.Enqueue(entry);

    // Persistence is optional: without a connection there is nothing more to do.
    if (mDbConnection == null)
    {
        return;
    }

    BinarySerializer serializer = new BinarySerializer();
    entry.mHashCodes.Save(serializer);

    // GetBuffer() returns the whole backing array, so only the bytes up to the
    // current stream position are encoded.
    MemoryStream memory = (MemoryStream)serializer.Stream;
    byte[] raw = memory.GetBuffer();
    int usedLength = (int)serializer.Stream.Position;
    string encodedHashCodes = Convert.ToBase64String(raw, 0, usedLength);

    mDbConnection.ExecuteNonQuery(
        "insert into TextBlocks (corpusId, docId, hashCodes) values (?, ?, ?)",
        corpusId,
        documentId,
        encodedHashCodes);
}
/// <summary>
/// Processes a document corpus: groups its documents by domain name and then, for each domain
/// and while holding that domain's lock, updates revision info from the URL cache, inserts
/// text-block hash codes into the URL tree and annotates every text block via
/// <c>SetBlockAnnotation</c>.
/// </summary>
/// <remarks>
/// Document features written here: "blacklisted", "urlKey", "domainName", "rev",
/// "bprBoilerplateCharCount", "bprContentCharCount" and, for documents whose "rev" is not "1",
/// "unseenContentCharCount" and "unseenContent" ("Yes"/"No").
/// Documents without a "responseUrl" feature are left out of every domain group.
/// An exception raised while handling a single document is logged as "ProcessDocument" and that
/// document is skipped for the current step; any other exception inside the method's outer try
/// is logged as "ProcessData". The cast of <paramref name="data"/> happens outside that try.
/// </remarks>
/// <param name="sender">Not used by this method.</param>
/// <param name="data">The corpus to process; cast to <c>DocumentCorpus</c>.</param>
/// <returns>The corpus instance that was passed in as <paramref name="data"/>.</returns>
protected override object ProcessData(IDataProducer sender, object data)
{
    DocumentCorpus corpus = (DocumentCorpus)data;
    try
    {
        // split corpus according to document domain names
        Dictionary<string, ArrayList<Document>> domainDocCollections = new Dictionary<string, ArrayList<Document>>();
        foreach (Document document in corpus.Documents)
        {
            try
            {
                string responseUrl = document.Features.GetFeatureValue("responseUrl");
                if (responseUrl == null)
                {
                    continue;
                }
                bool blacklisted;
                string urlKey = mUrlNormalizer.NormalizeUrl(responseUrl, document.Name, out blacklisted, UrlNormalizer.NormalizationMode.Heuristics);
                document.Features.SetFeatureValue("blacklisted", blacklisted.ToString());
                document.Features.SetFeatureValue("urlKey", urlKey);
                string domainName = GetDomainName(urlKey);
                document.Features.SetFeatureValue("domainName", domainName);
                ArrayList<Document> domainDocs;
                if (!domainDocCollections.TryGetValue(domainName, out domainDocs))
                {
                    domainDocCollections.Add(domainName, domainDocs = new ArrayList<Document>());
                }
                domainDocs.Add(document);
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessDocument", exception);
            }
        }

        // lock and process each domain separately
        foreach (KeyValuePair<string, ArrayList<Document>> domainInfo in domainDocCollections)
        {
            string domainName = domainInfo.Key;
            Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
            Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
            lock (AcquireLock(domainName)) // domain lock acquired
            {
                // Latest document time seen in this domain; handed to RemoveItems below.
                DateTime maxTime = DateTime.MinValue;

                // detect duplicates
                foreach (Document document in domainInfo.Value)
                {
                    try
                    {
                        DateTime time = DateTime.Parse(document.Features.GetFeatureValue("time"));
                        if (time > maxTime)
                        {
                            maxTime = time;
                        }
                        string urlKey = document.Features.GetFeatureValue("urlKey");
                        // Single lookup instead of ContainsKey followed by the indexer.
                        Ref<int> revInfo;
                        bool cached = urlInfo.First.TryGetValue(urlKey, out revInfo);
                        document.Features.SetFeatureValue("rev", "1");
                        if (cached)
                        {
                            // URL key already cached: bump the shared revision counter and record it.
                            revInfo.Val++;
                            document.Features.SetFeatureValue("rev", revInfo.Val.ToString());
                            continue;
                        }
                        AddToUrlCache(urlKey, time, urlInfo);
                    }
                    catch (Exception exception)
                    {
                        mLogger.Error("ProcessDocument", exception);
                    }
                }

                // populate URL tree
                // corpusHashCodes receives exactly one slot per document of the domain, in
                // iteration order. A null slot marks a document that is not of content type
                // "Text" or whose processing failed. Keeping one slot per document is what
                // lets the annotation loop below pair each document with its own hash codes
                // even after a per-document failure.
                ArrayList<ArrayList<ulong>> corpusHashCodes = new ArrayList<ArrayList<ulong>>();
                foreach (Document document in domainInfo.Value)
                {
                    ArrayList<ulong> docHashCodes = null;
                    try
                    {
                        string contentType = document.Features.GetFeatureValue("contentType");
                        if (contentType != "Text")
                        {
                            continue;
                        }
                        string docUrl = document.Features.GetFeatureValue("responseUrl");
                        string urlKey = document.Features.GetFeatureValue("urlKey");
                        TextBlock[] blocks = document.GetAnnotatedBlocks(mBlockSelector);
                        ArrayList<ulong> hashCodes = new ArrayList<ulong>();
                        for (int i = 0; i < blocks.Length; i++)
                        {
                            TextBlock block = blocks[i];
                            hashCodes.Add(UrlTree.ComputeHashCode(block.Text, /*alphaOnly=*/ true));
                        }
                        // Only first revisions are inserted into the tree at this point.
                        if (document.Features.GetFeatureValue("rev") == "1")
                        {
                            bool fullPath = urlKey.Contains("?");
                            string documentId = document.Features.GetFeatureValue("guid").Replace("-", "");
                            string corpusId = corpus.Features.GetFeatureValue("guid").Replace("-", "");
                            AddToUrlTree(textBlockInfo, docUrl, hashCodes, fullPath, corpusId, documentId, domainName, DateTime.Parse(document.Features.GetFeatureValue("time")), /*incDocCount=*/ true);
                        }
                        docHashCodes = hashCodes;
                    }
                    catch (Exception exception)
                    {
                        mLogger.Error("ProcessDocument", exception);
                    }
                    finally
                    {
                        // Runs on normal completion, on 'continue' and after a caught exception.
                        corpusHashCodes.Add(docHashCodes);
                    }
                }

                // annotate boilerplate
                int docIdx = 0;
                foreach (Document document in domainInfo.Value)
                {
                    // Advance once per document, before anything below can skip or throw.
                    ArrayList<ulong> hashCodes = corpusHashCodes[docIdx++]; // document's hash codes
                    if (hashCodes == null)
                    {
                        // Not a "Text" document, or the populate step failed for it (already logged).
                        continue;
                    }
                    try
                    {
                        string contentType = document.Features.GetFeatureValue("contentType");
                        if (contentType != "Text")
                        {
                            continue;
                        }
                        string docUrl = document.Features.GetFeatureValue("responseUrl");
                        string urlKey = document.Features.GetFeatureValue("urlKey");
                        // The value is not used below. The lookup is kept because the dictionary
                        // indexer throws for a URL key that is absent from the cache, which
                        // (through the catch below) skips such a document exactly as before.
                        Ref<int> revInfo = urlInfo.First[urlKey];
                        TextBlock[] blocks = document.GetAnnotatedBlocks(mBlockSelector);
                        UrlTree urlTree = GetTextBlockInfo(domainName).First;
                        bool fullPath = urlKey.Contains("?");
                        UrlTree.NodeInfo[] result = urlTree.Query(docUrl, hashCodes, mMinNodeDocCount, /*fullPath=*/ fullPath);
                        int bpCharCount = 0, contentCharCount = 0, unseenContentCharCount = 0;
                        ArrayList<ulong> unseenContentHashCodes = new ArrayList<ulong>();
                        for (int i = 0; i < blocks.Length; i++)
                        {
                            TextBlock block = blocks[i];
                            string pathInfo = GetPathInfo(result, i);
                            SetBlockAnnotation(document, result, mHeuristicsType, i, pathInfo, block);
                            if (block.Annotation.Type == "TextBlock/Boilerplate")
                            {
                                bpCharCount += block.Text.Length;
                            }
                            else
                            {
                                contentCharCount += block.Text.Length;
                            }
                            if (block.Annotation.Type == "TextBlock/Content/Unseen")
                            {
                                unseenContentCharCount += block.Text.Length;
                                unseenContentHashCodes.Add(hashCodes[i]);
                            }
                        }
                        document.Features.SetFeatureValue("bprBoilerplateCharCount", bpCharCount.ToString());
                        document.Features.SetFeatureValue("bprContentCharCount", contentCharCount.ToString());
                        if (document.Features.GetFeatureValue("rev") != "1")
                        {
                            document.Features.SetFeatureValue("unseenContentCharCount", unseenContentCharCount.ToString());
                            if (unseenContentCharCount > mExactDuplicateThreshold)
                            {
                                document.Features.SetFeatureValue("unseenContent", "Yes");
                                string documentId = document.Features.GetFeatureValue("guid").Replace("-", "");
                                string corpusId = corpus.Features.GetFeatureValue("guid").Replace("-", "");
                                DateTime time = DateTime.Parse(document.Features.GetFeatureValue("time"));
                                AddToUrlTree(textBlockInfo, docUrl, unseenContentHashCodes, /*fullPath=*/ fullPath, corpusId, documentId, domainName, time, /*incDocCount=*/ false);
                                AddToUrlCache(/*urlKey=*/ null, time, urlInfo); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                            }
                            else
                            {
                                document.Features.SetFeatureValue("unseenContent", "No");
                            }
                        }
                    }
                    catch (Exception exception)
                    {
                        mLogger.Error("ProcessDocument", exception);
                    }
                }

                // maxTime stays at MinValue when no document time could be parsed for this domain.
                if (maxTime != DateTime.MinValue)
                {
                    RemoveItems(urlInfo, textBlockInfo, maxTime);
                }
            } // domain lock released
        }
    }
    catch (Exception exception)
    {
        mLogger.Error("ProcessData", exception);
    }
    return corpus;
}