Esempio n. 1
0
        /// <summary>
        /// Builds a <c>DocumentCorpus</c> from an array of text lines, one document per line.
        /// </summary>
        /// <remarks>
        /// When <c>mIsNamedDoc</c> is set and the line contains a space, tab or newline, the text before the
        /// first such character becomes the document name and the remainder becomes the document text.
        /// Otherwise the document name is empty and the whole (trimmed) line is the text.
        /// Each document is stamped with a "_time" feature; the corpus receives "_provider", "_isNamedDoc",
        /// "_timeStart" and "_timeEnd" features.
        /// </remarks>
        /// <param name="sender">The producer that supplied the data (not used by this method).</param>
        /// <param name="data">Expected to be a <c>string[]</c>; otherwise an <c>ArgumentTypeException</c> is handed to <c>Utils.ThrowException</c>.</param>
        /// <returns>The newly created corpus.</returns>
        protected override object ProcessData(IDataProducer sender, object data)
        {
            Utils.ThrowException(data is string[] ? null : new ArgumentTypeException("data"));
            DateTime startedAt = DateTime.Now;
            DocumentCorpus result = new DocumentCorpus();
            char[] separators = new char[] { ' ', '\t', '\n' };

            string[] lines = (string[])data;
            for (int i = 0; i < lines.Length; i++)
            {
                string current = lines[i];
                int cut = current.IndexOfAny(separators);
                // a name is only extracted when naming is enabled and a separator was found
                bool hasName = mIsNamedDoc && cut >= 0;
                string name = hasName ? current.Substring(0, cut).Trim() : "";
                string text = hasName ? current.Substring(cut).Trim() : current.Trim();
                Document created = new Document(name, text);
                created.Features.SetFeatureValue("_time", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
                result.AddDocument(created);
            }

            result.Features.SetFeatureValue("_provider", GetType().ToString());
            result.Features.SetFeatureValue("_isNamedDoc", mIsNamedDoc.ToString());
            result.Features.SetFeatureValue("_timeStart", startedAt.ToString(Utils.DATE_TIME_SIMPLE));
            result.Features.SetFeatureValue("_timeEnd", DateTime.Now.ToString(Utils.DATE_TIME_SIMPLE));
            return result;
        }
Esempio n. 2
0
        /// <summary>
        /// Passes every document of the incoming corpus to <c>ConsumeDocument</c>, in enumeration order.
        /// </summary>
        /// <param name="sender">The producer that supplied the data (not used by this method).</param>
        /// <param name="data">Expected to be a <c>DocumentCorpus</c>; otherwise an <c>ArgumentTypeException</c> is handed to <c>Utils.ThrowException</c>.</param>
        protected override void ConsumeData(IDataProducer sender, object data)
        {
            Utils.ThrowException(data is DocumentCorpus ? null : new ArgumentTypeException("data"));

            foreach (Document item in ((DocumentCorpus)data).Documents)
            {
                ConsumeDocument(item);
            }
        }
Esempio n. 3
0
        /// <summary>
        /// Runs <c>ProcessDocument</c> on every document of the incoming corpus and hands the same corpus back.
        /// </summary>
        /// <param name="sender">The producer that supplied the data (not used by this method).</param>
        /// <param name="data">Expected to be a <c>DocumentCorpus</c>; otherwise an <c>ArgumentTypeException</c> is handed to <c>Utils.ThrowException</c>.</param>
        /// <returns>The corpus instance that was passed in as <paramref name="data"/>.</returns>
        public /*protected*/ override object ProcessData(IDataProducer sender, object data)
        {
            Utils.ThrowException(data is DocumentCorpus ? null : new ArgumentTypeException("data"));
            DocumentCorpus input = (DocumentCorpus)data;

            foreach (Document item in input.Documents)
            {
                ProcessDocument(item);
            }

            return input;
        }
Esempio n. 4
0
        /// <summary>
        /// Detects duplicate documents and annotates boilerplate text blocks in the given corpus,
        /// processing the documents of each domain under that domain's lock.
        /// </summary>
        /// <remarks>
        /// The method works in two stages:
        /// 1. Every document that has a "responseUrl" feature gets "blacklisted", "urlKey" and "domainName"
        ///    features and is grouped by domain name. Documents without "responseUrl" are skipped.
        /// 2. For each domain, while holding the object returned by <c>AcquireLock</c>:
        ///    a. duplicate detection sets the "rev" feature ("1" for a URL key not yet in the URL cache,
        ///       otherwise the incremented revision counter);
        ///    b. hash codes of the annotated blocks of "Text" documents are computed and, for "rev" == "1",
        ///       added to the URL tree;
        ///    c. each block is annotated via <c>SetBlockAnnotation</c> and character-count features are written;
        ///    d. <c>RemoveItems</c> is called with the latest document time seen in the domain.
        /// Exceptions raised while handling a single document are logged and do not stop the remaining
        /// documents; any other exception is logged as "ProcessData" and the corpus is still returned.
        ///
        /// Hash codes are kept in one slot per document (null when unavailable) so that a failure on one
        /// document cannot shift the hash codes used for the documents that follow it.
        /// </remarks>
        /// <param name="sender">The producer that supplied the data (not used by this method).</param>
        /// <param name="data">The corpus to process; it is cast to <c>DocumentCorpus</c> without a prior type check.</param>
        /// <returns>The corpus instance that was passed in as <paramref name="data"/>.</returns>
        protected override object ProcessData(IDataProducer sender, object data)
        {
            DocumentCorpus corpus = (DocumentCorpus)data;

            try
            {
                // split corpus according to document domain names
                Dictionary<string, ArrayList<Document>> domainDocCollections = new Dictionary<string, ArrayList<Document>>();
                foreach (Document document in corpus.Documents)
                {
                    try
                    {
                        string responseUrl = document.Features.GetFeatureValue("responseUrl");
                        if (responseUrl == null)
                        {
                            continue;
                        }
                        bool   blacklisted;
                        string urlKey = mUrlNormalizer.NormalizeUrl(responseUrl, document.Name, out blacklisted, UrlNormalizer.NormalizationMode.Heuristics);
                        document.Features.SetFeatureValue("blacklisted", blacklisted.ToString());
                        document.Features.SetFeatureValue("urlKey", urlKey);
                        string domainName = GetDomainName(urlKey);
                        document.Features.SetFeatureValue("domainName", domainName);
                        ArrayList<Document> domainDocs;
                        if (!domainDocCollections.TryGetValue(domainName, out domainDocs))
                        {
                            domainDocCollections.Add(domainName, domainDocs = new ArrayList<Document>());
                        }
                        domainDocs.Add(document);
                    }
                    catch (Exception exception)
                    {
                        mLogger.Error("ProcessDocument", exception);
                    }
                }
                // lock and process each domain separately
                foreach (KeyValuePair<string, ArrayList<Document>> domainInfo in domainDocCollections)
                {
                    string domainName = domainInfo.Key;
                    Pair<Dictionary<string, Ref<int>>, Queue<UrlHistoryEntry>> urlInfo = GetUrlInfo(domainName);
                    Pair<UrlTree, Queue<TextBlockHistoryEntry>> textBlockInfo         = GetTextBlockInfo(domainName);
                    lock (AcquireLock(domainName)) // domain lock acquired
                    {
                        DateTime maxTime = DateTime.MinValue;
                        // detect duplicates
                        foreach (Document document in domainInfo.Value)
                        {
                            try
                            {
                                DateTime time = DateTime.Parse(document.Features.GetFeatureValue("time"));
                                if (time > maxTime)
                                {
                                    maxTime = time;
                                }
                                string   urlKey = document.Features.GetFeatureValue("urlKey");
                                Ref<int> revInfo;
                                // single lookup instead of ContainsKey followed by the indexer
                                bool cached = urlInfo.First.TryGetValue(urlKey, out revInfo);
                                document.Features.SetFeatureValue("rev", "1");
                                if (cached)
                                {
                                    revInfo.Val++;
                                    document.Features.SetFeatureValue("rev", revInfo.Val.ToString());
                                    continue;
                                }
                                AddToUrlCache(urlKey, time, urlInfo);
                            }
                            catch (Exception exception)
                            {
                                mLogger.Error("ProcessDocument", exception);
                            }
                        }
                        // populate URL tree
                        // corpusHashCodes holds exactly one slot per document of this domain, in enumeration
                        // order; a slot is null for non-text documents and for documents whose processing failed
                        List<ArrayList<ulong>> corpusHashCodes = new List<ArrayList<ulong>>();
                        foreach (Document document in domainInfo.Value)
                        {
                            ArrayList<ulong> docHashCodes = null;
                            try
                            {
                                string contentType = document.Features.GetFeatureValue("contentType");
                                if (contentType != "Text")
                                {
                                    continue;
                                }
                                string           docUrl    = document.Features.GetFeatureValue("responseUrl");
                                string           urlKey    = document.Features.GetFeatureValue("urlKey");
                                TextBlock[]      blocks    = document.GetAnnotatedBlocks(mBlockSelector);
                                ArrayList<ulong> hashCodes = new ArrayList<ulong>();
                                for (int i = 0; i < blocks.Length; i++)
                                {
                                    TextBlock block = blocks[i];
                                    hashCodes.Add(UrlTree.ComputeHashCode(block.Text, /*alphaOnly=*/ true));
                                }
                                if (document.Features.GetFeatureValue("rev") == "1")
                                {
                                    bool   fullPath   = urlKey.Contains("?");
                                    string documentId = document.Features.GetFeatureValue("guid").Replace("-", "");
                                    string corpusId   = corpus.Features.GetFeatureValue("guid").Replace("-", "");
                                    AddToUrlTree(textBlockInfo, docUrl, hashCodes, fullPath, corpusId, documentId, domainName, DateTime.Parse(document.Features.GetFeatureValue("time")), /*incDocCount=*/ true);
                                }
                                // only publish the hash codes once the whole step has succeeded
                                docHashCodes = hashCodes;
                            }
                            catch (Exception exception)
                            {
                                mLogger.Error("ProcessDocument", exception);
                            }
                            finally
                            {
                                // runs on success, on "continue" and on failure alike, keeping the slots aligned
                                corpusHashCodes.Add(docHashCodes);
                            }
                        }
                        // annotate boilerplate
                        int docIdx = -1;
                        foreach (Document document in domainInfo.Value)
                        {
                            docIdx++; // position of this document within the domain; matches its corpusHashCodes slot
                            try
                            {
                                string contentType = document.Features.GetFeatureValue("contentType");
                                if (contentType != "Text")
                                {
                                    continue;
                                }
                                ArrayList<ulong> hashCodes = corpusHashCodes[docIdx]; // document's hash codes
                                if (hashCodes == null)
                                {
                                    // the previous step failed for this document (already logged there)
                                    continue;
                                }
                                string docUrl = document.Features.GetFeatureValue("responseUrl");
                                string urlKey = document.Features.GetFeatureValue("urlKey");
                                // The value is not used; the lookup is retained because it throws when the URL key
                                // is missing from the URL cache, which skips annotation of such documents.
                                Ref<int>    revInfo = urlInfo.First[urlKey];
                                TextBlock[] blocks  = document.GetAnnotatedBlocks(mBlockSelector);
                                // NOTE(review): the tree is re-fetched for every document; presumably it is the same
                                // instance as textBlockInfo.First — confirm before hoisting.
                                UrlTree            urlTree = GetTextBlockInfo(domainName).First;
                                UrlTree.NodeInfo[] result  = urlTree.Query(docUrl, hashCodes, mMinNodeDocCount, /*fullPath=*/ urlKey.Contains("?"));
                                int bpCharCount = 0, contentCharCount = 0, unseenContentCharCount = 0;
                                ArrayList<ulong> unseenContentHashCodes = new ArrayList<ulong>();
                                for (int i = 0; i < blocks.Length; i++)
                                {
                                    TextBlock block    = blocks[i];
                                    string    pathInfo = GetPathInfo(result, i);
                                    SetBlockAnnotation(document, result, mHeuristicsType, i, pathInfo, block);
                                    if (block.Annotation.Type == "TextBlock/Boilerplate")
                                    {
                                        bpCharCount += block.Text.Length;
                                    }
                                    else
                                    {
                                        contentCharCount += block.Text.Length;
                                    }
                                    if (block.Annotation.Type == "TextBlock/Content/Unseen")
                                    {
                                        unseenContentCharCount += block.Text.Length;
                                        unseenContentHashCodes.Add(hashCodes[i]);
                                    }
                                }
                                document.Features.SetFeatureValue("bprBoilerplateCharCount", bpCharCount.ToString());
                                document.Features.SetFeatureValue("bprContentCharCount", contentCharCount.ToString());
                                if (document.Features.GetFeatureValue("rev") != "1")
                                {
                                    document.Features.SetFeatureValue("unseenContentCharCount", unseenContentCharCount.ToString());
                                    if (unseenContentCharCount > mExactDuplicateThreshold)
                                    {
                                        document.Features.SetFeatureValue("unseenContent", "Yes");
                                        string   documentId = document.Features.GetFeatureValue("guid").Replace("-", "");
                                        string   corpusId   = corpus.Features.GetFeatureValue("guid").Replace("-", "");
                                        DateTime time       = DateTime.Parse(document.Features.GetFeatureValue("time"));
                                        AddToUrlTree(textBlockInfo, docUrl, unseenContentHashCodes, /*fullPath=*/ urlKey.Contains("?"), corpusId, documentId, domainName, time, /*incDocCount=*/ false);
                                        AddToUrlCache(/*urlKey=*/ null, time, urlInfo); // dummy entry into the URL queue (to ensure sync with the text blocks queue)
                                    }
                                    else
                                    {
                                        document.Features.SetFeatureValue("unseenContent", "No");
                                    }
                                }
                            }
                            catch (Exception exception)
                            {
                                mLogger.Error("ProcessDocument", exception);
                            }
                        }
                        // maxTime stays at MinValue when no document time could be parsed
                        if (maxTime != DateTime.MinValue)
                        {
                            RemoveItems(urlInfo, textBlockInfo, maxTime);
                        }
                    } // domain lock released
                }
            }
            catch (Exception exception)
            {
                mLogger.Error("ProcessData", exception);
            }
            return(corpus);
        }