Пример #1
0
 /// <summary>
 /// Builds the tree used by the tests: a <c>UrlTree&lt;string&gt;</c> constructed with "chris",
 /// to which "damon", "mason" and "clara" are then added in that order.
 /// </summary>
 public override void EstablishContext()
 {
     tree = new UrlTree<string>("chris");

     // Same values, same insertion order as the fixture has always used.
     string[] additionalItems = new string[] { "damon", "mason", "clara" };
     foreach (string item in additionalItems)
     {
         tree.Add(item);
     }
 }
        /// <summary>
        /// Inserts a document's text-block hash codes into the URL tree held in <paramref name="textBlockInfo"/>
        /// and enqueues a matching history entry in the queue held in the same pair.
        /// </summary>
        /// <param name="textBlockInfo">Pair whose <c>First</c> is the URL tree to insert into and whose <c>Second</c> is the history queue.</param>
        /// <param name="responseUrl">URL passed to both the history entry and the tree insertion.</param>
        /// <param name="hashCodes">Hash codes passed to both the history entry and the tree insertion.</param>
        /// <param name="fullPath">Forwarded unchanged to the history entry and to <c>UrlTree.Insert</c>.</param>
        /// <param name="corpusId">Not used by this method.</param>
        /// <param name="documentId">Not used by this method.</param>
        /// <param name="domainName">Not used by this method.</param>
        /// <param name="time">Timestamp stored in the history entry.</param>
        /// <param name="incDocCount">Forwarded to <c>UrlTree.Insert</c>; the same value is also passed as the history entry's last constructor argument (labelled decDocCount at the call site).</param>
        private void AddToUrlTree(Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo, string responseUrl, ArrayList <ulong> hashCodes, bool fullPath, string corpusId,
                                  string documentId, string domainName, DateTime time, bool incDocCount)
        {
            UrlTree tree = textBlockInfo.First;
            Queue<TextBlockHistoryEntry> history = textBlockInfo.Second;

            // The entry is created before the insertion, exactly as before.
            TextBlockHistoryEntry entry = new TextBlockHistoryEntry(
                responseUrl,
                hashCodes,
                fullPath,
                time,
                /*decDocCount=*/ incDocCount);

            tree.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/ true, incDocCount);
            history.Enqueue(entry);
        }
Пример #3
0
        /// <summary>
        /// Inserts a document's text-block hash codes into the URL tree held in <paramref name="textBlockInfo"/>,
        /// enqueues a matching history entry and, when <c>mDbConnection</c> is not null, also stores the
        /// entry's hash codes in the <c>TextBlocks</c> table as a Base64 string.
        /// </summary>
        /// <param name="textBlockInfo">Pair whose <c>First</c> is the URL tree to insert into and whose <c>Second</c> is the history queue.</param>
        /// <param name="responseUrl">URL passed to both the history entry and the tree insertion.</param>
        /// <param name="hashCodes">Hash codes passed to both the history entry and the tree insertion.</param>
        /// <param name="fullPath">Forwarded unchanged to the history entry and to <c>UrlTree.Insert</c>.</param>
        /// <param name="corpusId">Value written to the <c>corpusId</c> column.</param>
        /// <param name="documentId">Value written to the <c>docId</c> column.</param>
        /// <param name="domainName">Not used by this method.</param>
        /// <param name="time">Timestamp stored in the history entry.</param>
        /// <param name="incDocCount">Forwarded to <c>UrlTree.Insert</c>; the same value is also passed as the history entry's last constructor argument (labelled decDocCount at the call site).</param>
        private void AddToUrlTree(Pair <UrlTree, Queue <TextBlockHistoryEntry> > textBlockInfo, string responseUrl, ArrayList <ulong> hashCodes, bool fullPath, string corpusId,
                                  string documentId, string domainName, DateTime time, bool incDocCount)
        {
            UrlTree tree = textBlockInfo.First;
            Queue<TextBlockHistoryEntry> history = textBlockInfo.Second;
            TextBlockHistoryEntry entry = new TextBlockHistoryEntry(responseUrl, hashCodes, fullPath, time, /*decDocCount=*/ incDocCount);

            tree.Insert(responseUrl, hashCodes, mMinNodeDocCount, fullPath, /*insertUnique=*/ true, incDocCount);
            history.Enqueue(entry);

            // Persistence is optional: without a connection the in-memory update above is all that happens.
            if (mDbConnection == null)
            {
                return;
            }

            // Serialize the hash codes stored on the entry and encode only the bytes written so far
            // (GetBuffer returns the whole underlying buffer, so the stream position bounds the payload).
            BinarySerializer serializer = new BinarySerializer();
            entry.mHashCodes.Save(serializer);
            MemoryStream serialized = (MemoryStream)serializer.Stream;
            int byteCount = (int)serializer.Stream.Position;
            string hashCodesBase64 = Convert.ToBase64String(serialized.GetBuffer(), 0, byteCount);

            mDbConnection.ExecuteNonQuery("insert into TextBlocks (corpusId, docId, hashCodes) values (?, ?, ?)", corpusId, documentId, hashCodesBase64);
        }
Пример #4
0
        /// <summary>
        /// Processes a document corpus in three stages: normalizes each document's response URL and groups the
        /// documents by domain name; then, for each domain and while holding that domain's lock, assigns revision
        /// numbers, inserts text-block hash codes of first-revision documents into the domain's URL tree, and
        /// annotates every text block using the result of a URL tree query; finally calls <c>RemoveItems</c>
        /// with the latest document time seen for the domain.
        /// </summary>
        /// <param name="sender">The producer that supplied <paramref name="data"/>; not used by this method.</param>
        /// <param name="data">The corpus to process. It is cast to <c>DocumentCorpus</c> outside the try block, so an invalid cast propagates to the caller.</param>
        /// <returns>The same corpus instance, after its documents' features and block annotations have been updated.</returns>
        /// <remarks>
        /// Exceptions raised while handling a single document are logged under "ProcessDocument" and processing
        /// continues with the next document. Any other exception is logged under "ProcessData" and the corpus is
        /// still returned.
        /// </remarks>
        protected override object ProcessData(IDataProducer sender, object data)
        {
            DocumentCorpus corpus = (DocumentCorpus)data;

            try
            {
                // split corpus according to document domain names
                Dictionary<string, ArrayList<Document>> domainDocCollections = new Dictionary<string, ArrayList<Document>>();
                foreach (Document document in corpus.Documents)
                {
                    try
                    {
                        string responseUrl = document.Features.GetFeatureValue("responseUrl");
                        if (responseUrl == null)
                        {
                            // documents without a response URL are not assigned to any domain
                            continue;
                        }
                        bool blacklisted;
                        string urlKey = mUrlNormalizer.NormalizeUrl(responseUrl, document.Name, out blacklisted, UrlNormalizer.NormalizationMode.Heuristics);
                        document.Features.SetFeatureValue("blacklisted", blacklisted.ToString());
                        document.Features.SetFeatureValue("urlKey", urlKey);
                        string domainName = GetDomainName(urlKey);
                        document.Features.SetFeatureValue("domainName", domainName);
                        ArrayList<Document> domainDocs;
                        if (!domainDocCollections.TryGetValue(domainName, out domainDocs))
                        {
                            domainDocCollections.Add(domainName, domainDocs = new ArrayList<Document>());
                        }
                        domainDocs.Add(document);
                    }
                    catch (Exception exception)
                    {
                        mLogger.Error("ProcessDocument", exception);
                    }
                }
                // lock and process each domain separately
                foreach (KeyValuePair<string, ArrayList<Document>> domainInfo in domainDocCollections)
                {
                    string domainName = domainInfo.Key;
                    Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
                    Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo = GetTextBlockInfo(domainName);
                    lock (AcquireLock(domainName)) // domain lock acquired
                    {
                        DateTime maxTime = DateTime.MinValue;
                        // detect duplicates
                        foreach (Document document in domainInfo.Value)
                        {
                            try
                            {
                                DateTime time = DateTime.Parse(document.Features.GetFeatureValue("time"));
                                if (time > maxTime)
                                {
                                    maxTime = time;
                                }
                                string urlKey = document.Features.GetFeatureValue("urlKey");
                                // single lookup instead of ContainsKey followed by the indexer
                                Ref<int> revInfo;
                                bool cached = urlInfo.First.TryGetValue(urlKey, out revInfo);
                                document.Features.SetFeatureValue("rev", "1");
                                if (cached)
                                {
                                    // URL key already known for this domain: bump and record the revision number
                                    revInfo.Val++;
                                    document.Features.SetFeatureValue("rev", revInfo.Val.ToString());
                                    continue;
                                }
                                AddToUrlCache(urlKey, time, urlInfo);
                            }
                            catch (Exception exception)
                            {
                                mLogger.Error("ProcessDocument", exception);
                            }
                        }
                        // populate URL tree
                        // corpusHashCodes receives exactly one slot per document in domainInfo.Value, in iteration
                        // order. A slot is null when the document is not of content type "Text" or when this step
                        // failed for it. Keeping the slots positional guarantees that the annotation pass below
                        // always pairs a document with its own hash codes, even after a logged exception.
                        ArrayList<ArrayList<ulong>> corpusHashCodes = new ArrayList<ArrayList<ulong>>();
                        foreach (Document document in domainInfo.Value)
                        {
                            ArrayList<ulong> docHashCodes = null;
                            try
                            {
                                string contentType = document.Features.GetFeatureValue("contentType");
                                if (contentType != "Text")
                                {
                                    continue; // the finally block still records a null slot
                                }
                                string docUrl = document.Features.GetFeatureValue("responseUrl");
                                string urlKey = document.Features.GetFeatureValue("urlKey");
                                TextBlock[] blocks = document.GetAnnotatedBlocks(mBlockSelector);
                                ArrayList<ulong> hashCodes = new ArrayList<ulong>();
                                for (int i = 0; i < blocks.Length; i++)
                                {
                                    TextBlock block = blocks[i];
                                    hashCodes.Add(UrlTree.ComputeHashCode(block.Text, /*alphaOnly=*/ true));
                                }
                                if (document.Features.GetFeatureValue("rev") == "1")
                                {
                                    // only first revisions are inserted here; later revisions are handled in the annotation pass
                                    bool fullPath = urlKey.Contains("?");
                                    string documentId = document.Features.GetFeatureValue("guid").Replace("-", "");
                                    string corpusId = corpus.Features.GetFeatureValue("guid").Replace("-", "");
                                    AddToUrlTree(textBlockInfo, docUrl, hashCodes, fullPath, corpusId, documentId, domainName, DateTime.Parse(document.Features.GetFeatureValue("time")), /*incDocCount=*/ true);
                                }
                                // as before, the hash codes are only published once the whole step succeeded
                                docHashCodes = hashCodes;
                            }
                            catch (Exception exception)
                            {
                                mLogger.Error("ProcessDocument", exception);
                            }
                            finally
                            {
                                corpusHashCodes.Add(docHashCodes);
                            }
                        }
                        // annotate boilerplate
                        int docIdx = 0;
                        foreach (Document document in domainInfo.Value)
                        {
                            // advance the index for every document, before anything can throw
                            ArrayList<ulong> hashCodes = corpusHashCodes[docIdx++]; // document's hash codes
                            if (hashCodes == null)
                            {
                                // non-text document, or the populate step failed for it (already logged)
                                continue;
                            }
                            try
                            {
                                string contentType = document.Features.GetFeatureValue("contentType");
                                if (contentType != "Text")
                                {
                                    continue;
                                }
                                string docUrl = document.Features.GetFeatureValue("responseUrl");
                                string urlKey = document.Features.GetFeatureValue("urlKey");
                                // NOTE(review): the value is not used; the lookup is kept because it throws (and
                                // thereby skips the document) when the URL key is missing from the URL cache.
                                Ref<int> revInfo = urlInfo.First[urlKey];
                                TextBlock[] blocks = document.GetAnnotatedBlocks(mBlockSelector);
                                UrlTree urlTree = GetTextBlockInfo(domainName).First;
                                UrlTree.NodeInfo[] result = urlTree.Query(docUrl, hashCodes, mMinNodeDocCount, /*fullPath=*/ urlKey.Contains("?"));
                                int bpCharCount = 0, contentCharCount = 0, unseenContentCharCount = 0;
                                ArrayList<ulong> unseenContentHashCodes = new ArrayList<ulong>();
                                for (int i = 0; i < blocks.Length; i++)
                                {
                                    TextBlock block = blocks[i];
                                    string pathInfo = GetPathInfo(result, i);
                                    SetBlockAnnotation(document, result, mHeuristicsType, i, pathInfo, block);
                                    if (block.Annotation.Type == "TextBlock/Boilerplate")
                                    {
                                        bpCharCount += block.Text.Length;
                                    }
                                    else
                                    {
                                        contentCharCount += block.Text.Length;
                                    }
                                    if (block.Annotation.Type == "TextBlock/Content/Unseen")
                                    {
                                        unseenContentCharCount += block.Text.Length;
                                        unseenContentHashCodes.Add(hashCodes[i]);
                                    }
                                }
                                document.Features.SetFeatureValue("bprBoilerplateCharCount", bpCharCount.ToString());
                                document.Features.SetFeatureValue("bprContentCharCount", contentCharCount.ToString());
                                if (document.Features.GetFeatureValue("rev") != "1")
                                {
                                    document.Features.SetFeatureValue("unseenContentCharCount", unseenContentCharCount.ToString());
                                    if (unseenContentCharCount > mExactDuplicateThreshold)
                                    {
                                        // a later revision with enough unseen content: record only the unseen blocks
                                        document.Features.SetFeatureValue("unseenContent", "Yes");
                                        string documentId = document.Features.GetFeatureValue("guid").Replace("-", "");
                                        string corpusId = corpus.Features.GetFeatureValue("guid").Replace("-", "");
                                        DateTime time = DateTime.Parse(document.Features.GetFeatureValue("time"));
                                        AddToUrlTree(textBlockInfo, docUrl, unseenContentHashCodes, /*fullPath=*/ urlKey.Contains("?"), corpusId, documentId, domainName, time, /*incDocCount=*/ false);
                                        AddToUrlCache(/*urlKey=*/ null, time, urlInfo); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                                    }
                                    else
                                    {
                                        document.Features.SetFeatureValue("unseenContent", "No");
                                    }
                                }
                            }
                            catch (Exception exception)
                            {
                                mLogger.Error("ProcessDocument", exception);
                            }
                        }
                        // maxTime stays at MinValue when no document time could be parsed for this domain
                        if (maxTime != DateTime.MinValue)
                        {
                            RemoveItems(urlInfo, textBlockInfo, maxTime);
                        }
                    } // domain lock released
                }
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessData", exception);
            }
            return corpus;
        }