示例#1
0
        /// <summary>
        ///     Extracts the text located between <paramref name="startString"/> and <paramref name="endString"/> in <paramref name="input"/>.
        /// </summary>
        /// <remarks>
        ///     Matching is case-insensitive (both the input and the markers are lower-cased with the invariant culture).
        ///     A '|' inside a marker is a placeholder for an optional quote: each marker is tried with the placeholder removed,
        ///     then replaced by a single quote, then replaced by a double quote.  The end marker is only searched for at or after
        ///     the position of the start marker.  The extracted span begins at the start marker itself.
        /// </remarks>
        /// <param name = "input">The markup to search.</param>
        /// <param name = "startString">The marker that opens the span; may contain '|' quote placeholders.</param>
        /// <param name = "endString">The marker that closes the span; may contain '|' quote placeholders.</param>
        /// <returns>
        ///     Two values joined by '|': the extracted text, and the extracted text after the span was passed through
        ///     <c>EncapsulateHyperLinks</c>.  Returns <c>null</c> when any argument is <c>null</c>, when either marker cannot be
        ///     found, or when the span between the markers is empty.
        /// </returns>
        public static string ExtractInnerText(string input, string startString, string endString)
        {
            if (input == null || startString == null || endString == null)
            {
                return null;
            }

            //indexes found in the lower-cased copy are applied to the original input below...
            string inputToLowerInvariant = input.ToLowerInvariant();

            int rawIndex = IndexOfQuoteVariant(inputToLowerInvariant, startString.ToLowerInvariant(), 0);

            if (rawIndex == -1)
            {
                return null;
            }

            //the end marker must be lower-cased too, otherwise a marker containing capitals can never match the lower-cased input...
            int rawIndexEnd = IndexOfQuoteVariant(inputToLowerInvariant, endString.ToLowerInvariant(), rawIndex);

            //also covers 'not found' (-1), as rawIndex is never negative here...
            if (rawIndexEnd - rawIndex < 1)
            {
                return null;
            }

            string raw = input.Substring(rawIndex, rawIndexEnd - rawIndex);
            string rawWithHyperLinks = EncapsulateHyperLinks(raw);

            return ExtractTextWithoutBrackets(raw) + "|" + ExtractTextWithoutBrackets(rawWithHyperLinks);
        }

        /// <summary>
        ///     Finds <paramref name="marker"/> in <paramref name="haystack"/>, trying each '|' placeholder as: nothing, a single quote, a double quote.
        /// </summary>
        /// <returns>The index of the first variant that matches at or after <paramref name="startIndex"/>, or -1.</returns>
        private static int IndexOfQuoteVariant(string haystack, string marker, int startIndex)
        {
            string[] quoteVariants = { "", "'", "\"" };

            foreach (string quoteVariant in quoteVariants)
            {
                //ordinal: markup markers are matched character for character, independent of the current culture...
                int index = haystack.IndexOf(marker.Replace("|", quoteVariant), startIndex, System.StringComparison.Ordinal);

                if (index != -1)
                {
                    return index;
                }
            }

            return -1;
        }

        /// <summary>
        ///     Wraps <paramref name="fragment"/> in angle brackets, runs it through <c>UserDefinedFunctions.ExtractText</c> and trims
        ///     whitespace and leftover angle brackets from both ends of the result.
        /// </summary>
        private static string ExtractTextWithoutBrackets(string fragment)
        {
            string extracted = UserDefinedFunctions.ExtractText("<" + fragment + ">").Value;

            //two passes, as whitespace may sit between a leftover bracket and the text...
            return extracted.Trim().TrimStart('<').TrimEnd('>').Trim().TrimStart('<').TrimEnd('>');
        }
        /// <summary>
        ///     Builds a single-fragment summary of the document stored at <paramref name="discoveryPath"/> for <paramref name="query"/>.
        /// </summary>
        /// <param name = "query">The query used to score fragments.</param>
        /// <param name = "shouldDocumentsBeClustered">Not used by this overload.</param>
        /// <param name = "discoveryPath">The path of the file to summarize.</param>
        /// <param name = "encoding">The encoding used to read the file.</param>
        /// <param name = "cache">Not used by this overload.</param>
        /// <returns>The best fragment followed by " ...", with leading spaces and commas removed.</returns>
        public static string Summarize(Query query, bool shouldDocumentsBeClustered, string discoveryPath, Encoding encoding, Cache cache)
        {
            StandardAnalyzer analyzer = new StandardAnalyzer();

            //fragments are limited to 150 characters...
            Highlighter fragmentHighlighter = new Highlighter(new QueryScorer(query));
            fragmentHighlighter.SetTextFragmenter(new SimpleFragmenter(150));

            string fileContents = File.ReadAllText(discoveryPath, encoding);
            string plainText = UserDefinedFunctions.ExtractText(fileContents).Value;

            TokenStream tokens = analyzer.TokenStream("text", new StringReader(plainText));

            string summary = fragmentHighlighter.GetBestFragments(tokens, plainText, 1, "...") + " ...";

            return summary.TrimStart(' ', ',');
        }
示例#3
0
        /// <summary>
        ///     Classifies the decoded HTML of a crawled web page with the two Bayesian classifiers and
        ///     resets the class to 0 when the matching false-positive classifier scores at least as high.
        /// </summary>
        /// <remarks>
        ///     Nothing is done for disallowed requests, requests that should not be processed, requests whose
        ///     web client recorded a WebException, non-WebPage discoveries or requests without data.
        ///     The resulting class is held in a local only; this method does not store or return it.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            bool shouldBeSkipped = crawlRequest.IsDisallowed || !crawlRequest.ProcessData || crawlRequest.WebClient.WebException != null;

            if (shouldBeSkipped)
            {
                return;
            }

            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage || crawlRequest.Data == null)
            {
                return;
            }

            //inspect this local in a debugger to see what was analyzed...
            string analyzedText = UserDefinedFunctions.ExtractText(crawlRequest.DecodedHtml).Value;

            double scoreForClass1 = _class1BayesianClassifier.Classify(crawlRequest.DecodedHtml);
            double scoreForClass2 = _class2BayesianClassifier.Classify(crawlRequest.DecodedHtml);

            byte @class = DetermineClass(scoreForClass1, scoreForClass2, true);

            double falsePositiveScoreForClass1 = _class1BayesianClassifierFalsePositive.Classify(crawlRequest.DecodedHtml);
            double falsePositiveScoreForClass2 = _class2BayesianClassifierFalsePositive.Classify(crawlRequest.DecodedHtml);

            //a class is discarded when its false-positive score is at least as high as its score...
            bool isFalsePositive = (@class == 1 && falsePositiveScoreForClass1 >= scoreForClass1) ||
                                   (@class == 2 && falsePositiveScoreForClass2 >= scoreForClass2);

            if (isFalsePositive)
            {
                @class = 0;
            }
        }
示例#4
0
        /// <summary>
        ///     Walks <paramref name="htmlNode"/> and its element descendants, recording one <c>XPathInfo</c> per distinct
        ///     path of tag names and counting how often each recorded path is visited.
        /// </summary>
        /// <remarks>
        ///     A path is recorded the first time it is reached through an element whose tag is in <c>_allowedTagNames</c>
        ///     and whose extracted text is not blank.  Every visit to a recorded path (including the one that records it)
        ///     increments its <c>Count</c>.
        /// </remarks>
        /// <param name = "htmlNode">The node to start from.</param>
        /// <param name = "xpath">The path of tag names leading to the parent of <paramref name="htmlNode"/>.</param>
        /// <param name = "xpathInfos">The dictionary, keyed by path, that is filled in place.</param>
        /// <returns>The same <paramref name="xpathInfos"/> instance.</returns>
        private static IDictionary <string, XPathInfo> GenerateXPaths(HtmlNode htmlNode, string xpath, IDictionary <string, XPathInfo> xpathInfos)
        {
            if (htmlNode.NodeType == HtmlNodeType.Element)
            {
                xpath = xpath + "/" + htmlNode.Name;

                bool isAllowedTag = _allowedTagNames.Contains(htmlNode.Name);

                XPathInfo pathInfo;
                bool isRecordedPath = xpathInfos.TryGetValue(xpath, out pathInfo);

                if (isAllowedTag && !isRecordedPath)
                {
                    string extractedText = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

                    //only paths that carry visible text are recorded...
                    if (extractedText.Trim().Length != 0)
                    {
                        pathInfo = new XPathInfo { InnerText = extractedText, Tag = htmlNode.Name, XPath = xpath };

                        xpathInfos.Add(xpath, pathInfo);
                        isRecordedPath = true;
                    }
                }

                if (isRecordedPath)
                {
                    pathInfo.Count++;
                }

                Debug.Print(xpath);
            }

            foreach (HtmlNode child in htmlNode.ChildNodes)
            {
                if (child.NodeType != HtmlNodeType.Element)
                {
                    continue;
                }

                GenerateXPaths(child, xpath, xpathInfos);
            }

            return xpathInfos;
        }
示例#5
0
        /// <summary>
        ///     Evaluates the XPath in <c>tbXPath</c> against the loaded document and shows the extracted text of the
        ///     first matching node in a modal <c>Evaluate</c> dialog.
        /// </summary>
        /// <param name = "sender">The sender.</param>
        /// <param name = "e">The event arguments.</param>
        private void btnEvaluateXPath_Click(object sender, EventArgs e)
        {
            try
            {
                HtmlNode selectedNode = _htmlDocument.DocumentNode.SelectSingleNode(tbXPath.Text);

                //SelectSingleNode yields null when nothing matches; report that instead of failing with a NullReferenceException...
                if (selectedNode == null)
                {
                    MessageBox.Show("The XPath expression did not match any node.", _formText);

                    return;
                }

                //a form shown with ShowDialog is not disposed when it closes, so dispose it explicitly...
                using (Evaluate evaluate = new Evaluate())
                {
                    evaluate.dataGridView1.Rows.Clear();

                    evaluate.dataGridView1.Rows.Add(UserDefinedFunctions.ExtractText(selectedNode.InnerHtml).Value);

                    evaluate.ShowDialog(this);
                }
            }
            catch (Exception exception)
            {
                //e.g. a malformed XPath expression...
                MessageBox.Show(exception.Message + Environment.NewLine + exception.StackTrace, _formText);
            }
        }
示例#6
0
        /// <summary>
        ///     Synchronizes the tree view, the XPath box and the result box with the element that was clicked in the browser.
        /// </summary>
        /// <remarks>
        ///     The clicked element is matched to the parsed document through the "arachnode_scraper_id" attribute, which is
        ///     compared with the active element's TabIndex.  When several nodes carry the same id, the last one wins.
        /// </remarks>
        /// <param name = "sender">The sender.</param>
        /// <param name = "e">The event arguments.</param>
        void htmlElement_Click(object sender, HtmlElementEventArgs e)
        {
            if (wbBrowser.Document == null)
            {
                return;
            }

            //read once: the active element and its id do not depend on the node being examined...
            HtmlElement activeElement = wbBrowser.Document.ActiveElement;

            if (activeElement == null)
            {
                return;
            }

            string scraperId = activeElement.TabIndex.ToString();

            foreach (HtmlNode htmlNode in _htmlDocument.DocumentNode.DescendantsAndSelf())
            {
                if (htmlNode.GetAttributeValue("arachnode_scraper_id", string.Empty) != scraperId)
                {
                    continue;
                }

                //FirstOrDefault: an element without a tree node must not throw out of the event handler...
                TreeNode treeNode = _treeNodes.FirstOrDefault(tn => (HtmlElement)tn.Tag == activeElement);

                if (treeNode != null)
                {
                    tvBrowser.SelectedNode  = treeNode;
                    tvBrowser.HideSelection = false;
                }

                tbXPath.Text  = htmlNode.XPath;
                tbResult.Text = null;

                if (!string.IsNullOrEmpty(activeElement.InnerHtml))
                {
                    tbResult.Text = UserDefinedFunctions.ExtractText(activeElement.InnerHtml).Value;
                }
            }
        }
        /// <summary>
        ///     Manages the web page: optionally extracts its tags and text, optionally stores that metadata through the DAO
        ///     and optionally writes the page source to disk.
        /// </summary>
        /// <remarks>
        ///     When the page is saved to disk, the writer is not closed here; it is handed over through
        ///     <c>ManagedWebPage.StreamWriter</c>.  If writing fails, the writer is disposed before the failure is handled.
        /// </remarks>
        /// <param name = "webPageID">The web page ID.</param>
        /// <param name = "absoluteUri">The absolute URI.</param>
        /// <param name = "source">The raw bytes of the page.</param>
        /// <param name = "encoding">The encoding used to decode <paramref name="source"/> and to write the file.</param>
        /// <param name = "fullTextIndexType">The full text index type, passed on when the discovery path is resolved.</param>
        /// <param name = "extractWebPageMetaData">if set to <c>true</c> [extract web page meta data].</param>
        /// <param name = "insertWebPageMetaData">if set to <c>true</c> [insert web page meta data]; only honored when meta data is extracted.</param>
        /// <param name = "saveWebPageToDisk">if set to <c>true</c> [save web page to disk].</param>
        /// <returns>The managed web page, or <c>null</c> when an exception occurred.</returns>
        public override ManagedWebPage ManageWebPage(long webPageID, string absoluteUri, byte[] source, Encoding encoding, string fullTextIndexType, bool extractWebPageMetaData, bool insertWebPageMetaData, bool saveWebPageToDisk)
        {
            try
            {
                ManagedWebPage managedWebPage = new ManagedWebPage();

                string source2 = null;

                //the decoded source is only needed by the two optional steps below...
                if (extractWebPageMetaData || saveWebPageToDisk)
                {
                    source2 = encoding.GetString(source);
                }

                if (extractWebPageMetaData)
                {
                    //tags and text are extracted from the HTML-decoded source; the document is built from the undecoded source...
                    string source3 = HttpUtility.HtmlDecode(source2);

                    //ANODET: Enable the HtmlAgilityPack to work with bytes.
                    managedWebPage.HtmlDocument = _htmlManager.CreateHtmlDocument(source2, Encoding.Unicode);
                    managedWebPage.Tags         = UserDefinedFunctions.ExtractTags(source3).Value;
                    managedWebPage.Text         = UserDefinedFunctions.ExtractText(source3).Value;

                    #region Experimental Code comparing character parsing vs. regular expressions...

                    //bool inATag = false;

                    //StringBuilder stringBuilder = new StringBuilder();

                    //for (int i = 0; i < source3.Length; i++)
                    //{
                    //    if(source3[i] == '<')
                    //    {
                    //        inATag = true;
                    //        continue;
                    //    }

                    //    if (source3[i] == '>')
                    //    {
                    //        inATag = false;
                    //        continue;
                    //    }

                    //    if (!inATag && !char.IsControl(source3[i]))
                    //    {
                    //        stringBuilder.Append(source3[i]);
                    //    }
                    //}

                    //managedWebPage.Text = stringBuilder.ToString();

                    #endregion

                    if (insertWebPageMetaData)
                    {
                        _arachnodeDAO.InsertWebPageMetaData(webPageID, absoluteUri, encoding.GetBytes(managedWebPage.Text), managedWebPage.HtmlDocument.DocumentNode.OuterHtml);
                    }
                }

                if (saveWebPageToDisk)
                {
                    managedWebPage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, absoluteUri, fullTextIndexType);

                    StreamWriter streamWriter = new StreamWriter(managedWebPage.DiscoveryPath, false, encoding);

                    try
                    {
                        streamWriter.Write(source2);
                    }
                    catch
                    {
                        //nobody receives the writer when null is returned below, so release the file handle here...
                        streamWriter.Dispose();

                        throw;
                    }

                    managedWebPage.StreamWriter = streamWriter;
                }

                return managedWebPage;
            }
            catch (Exception exception)
            {
                //ANODET: Long paths...
#if !DEMO
                _arachnodeDAO.InsertException(absoluteUri, null, exception, false);
#endif
            }

            return null;
        }
示例#8
0
        /// <summary>
        ///     Builds a single-fragment summary of <paramref name="text"/> for <paramref name="query"/>.
        /// </summary>
        /// <remarks>
        ///     Three strategies are tried in order, each only when the previous one produced nothing ("..."):
        ///     highlighting the extracted text, highlighting the HTML-encoded original text, and finally cutting a
        ///     fragment around the first occurrence of the first "text" term of the query, with that term wrapped in
        ///     &lt;b&gt; tags.
        /// </remarks>
        /// <param name = "query">The query used to score fragments and to extract terms.</param>
        /// <param name = "wildcardSafeQuery">The query whose terms are used when extracting terms from <paramref name="query"/> throws.</param>
        /// <param name = "shouldDocumentsBeClustered">Not used by this overload.</param>
        /// <param name = "text">The text to summarize.</param>
        /// <returns>The summary, or "..." when no strategy produced a fragment.</returns>
        public static string Summarize(Query query, Query wildcardSafeQuery, bool shouldDocumentsBeClustered, string text)
        {
            const int fragmentLength = 150;

            char[] leadingNoise = { ' ', ',' };

            StandardAnalyzer standardAnalyzer = new StandardAnalyzer();

            Highlighter highligher = new Highlighter(new QueryScorer(query));

            highligher.SetTextFragmenter(new SimpleFragmenter(fragmentLength));

            //1.) highlight the extracted text...
            string text2 = UserDefinedFunctions.ExtractText(text).Value;

            TokenStream tokenStream = standardAnalyzer.TokenStream("text", new StringReader(text2));

            string bestFragments = (highligher.GetBestFragments(tokenStream, text2, 1, "...") + " ...").TrimStart(leadingNoise);

            if (bestFragments != "...")
            {
                return bestFragments;
            }

            //2.) highlight the HTML-encoded original text...
            text = HttpUtility.HtmlEncode(text);

            tokenStream = standardAnalyzer.TokenStream("text", new StringReader(text));

            bestFragments = (highligher.GetBestFragments(tokenStream, text, 1, "...") + " ...").TrimStart(leadingNoise);

            if (bestFragments != "...")
            {
                return bestFragments;
            }

            //3.) cut a fragment around the first query term...
            Hashtable hashTable = new Hashtable();

            try
            {
                query.ExtractTerms(hashTable);
            }
            catch
            {
                //best effort: fall back to the wildcard safe query when the terms cannot be extracted from this query...
                try
                {
                    wildcardSafeQuery.ExtractTerms(hashTable);
                }
                catch
                {
                    //best effort: without terms the summary simply stays "..."
                }
            }

            string firstTerm = null;

            foreach (Term term in hashTable.Values)
            {
                if (term.Field() != "text")
                {
                    continue;
                }

                string termText = term.Text();

                if (termText == null)
                {
                    continue;
                }

                string firstWord = termText.Split(' ')[0];

                //an empty word would match everywhere and highlight nothing useful...
                if (firstWord.Length != 0)
                {
                    firstTerm = firstWord;

                    break;
                }
            }

            if (firstTerm == null)
            {
                return bestFragments;
            }

            //case-insensitive, matching the case-insensitive replacement below...
            int index = text.IndexOf(firstTerm, System.StringComparison.OrdinalIgnoreCase);

            if (index == -1)
            {
                return bestFragments;
            }

            int length = fragmentLength;

            if (index + length > text.Length)
            {
                length = text.Length - index;
            }

            string highlightedTerm = "<b>" + firstTerm + "</b>";

            //the term is user input: escape it so regex metacharacters (e.g. '*', '?', '+') are matched literally, and
            //use an evaluator so a '$' in the term is not interpreted as a substitution...
            return Regex.Replace(text.Substring(index, length), Regex.Escape(firstTerm), match => highlightedTerm, RegexOptions.IgnoreCase) + "...";
        }
示例#9
0
        /// <summary>
        ///     Tries to locate the main content of a crawled web page and shows the extracted text in a message box.
        /// </summary>
        /// <remarks>
        ///     Steps: generate the tag-name paths of the document, let <c>ProcessXPaths</c> process them, pick candidate
        ///     paths in descending <c>LevenstheinDistance</c> order, count how often each distinct non-blank text occurs
        ///     under the candidates (plus how many other texts contain it or are contained in it), and finally output
        ///     the text of every node selected by the path(s) with the highest count.
        ///     Nothing is shown when the request is not a web page, has no data or managed discovery, or yields no text.
        /// </remarks>
        /// <param name = "crawlRequest">The crawl request.</param>
        /// <param name = "arachnodeDAO">The arachnode DAO.</param>
        public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
        {
            //use this instead: http://code.google.com/p/boilerpipe/

            /*if (!crawlRequest.ProcessData)
             * {
             *  return;
             * }*/

            if (crawlRequest.DataType.DiscoveryType != DiscoveryType.WebPage || crawlRequest.Data == null)
            {
                return;
            }

            ManagedWebPage managedWebPage = (ManagedWebPage)crawlRequest.ManagedDiscovery;

            if (managedWebPage == null)
            {
                return;
            }

            if (managedWebPage.HtmlDocument == null)
            {
                managedWebPage.HtmlDocument = crawlRequest.Crawl.Crawler.HtmlManager.CreateHtmlDocument(crawlRequest.Html, Encoding.Unicode);
            }

            HtmlNode documentNode = managedWebPage.HtmlDocument.DocumentNode;

            IDictionary <string, XPathInfo> xPathInfos = new Dictionary <string, XPathInfo>();

            xPathInfos = GenerateXPaths(documentNode, string.Empty, xPathInfos);

            //string dateXPath = ExtractDateXPath(htmlDocument1, xpathInfos);

            //List<string> dates = htmlDocument1.DocumentNode.SelectNodes(dateXPath).OfType<HtmlNode>().Select(h => h.InnerText).ToList();

            ProcessXPaths(xPathInfos);

            List <XPathInfo> rankedXPathInfos = xPathInfos.Values.OrderByDescending(x => x.LevenstheinDistance).ToList();

            //candidate selection: keep every path that is deeper (has more '/') than any path kept so far...
            List <XPathInfo> candidateXPathInfos = new List <XPathInfo>();

            int numberOfSlashes = 0;
            int xPaths = 0;
            int minimumNumberOfXPaths = 5;

            foreach (XPathInfo xPathInfo in rankedXPathInfos)
            {
                int numberOfSlashes2 = xPathInfo.XPath.Length - xPathInfo.XPath.Replace("/", string.Empty).Length;

                if (numberOfSlashes2 > numberOfSlashes)
                {
                    numberOfSlashes = numberOfSlashes2;

                    candidateXPathInfos.Add(xPathInfo);
                }
                else if (xPaths++ > minimumNumberOfXPaths)
                {
                    //enough paths were rejected; stop looking...
                    break;
                }
            }

            //count the occurrences of each distinct, non-blank text found under the candidate paths...
            Dictionary <string, XPathInfo> textInfos = new Dictionary <string, XPathInfo>();

            foreach (XPathInfo xPathInfo in candidateXPathInfos)
            {
                //SelectNodes yields null (not an empty collection) when nothing matches...
                var htmlNodes = documentNode.SelectNodes(xPathInfo.XPath);

                if (htmlNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode htmlNode in htmlNodes)
                {
                    string text = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

                    //blank text never takes part in the ranking...
                    if (string.IsNullOrEmpty(text) || text.Trim().Length == 0)
                    {
                        continue;
                    }

                    XPathInfo textInfo;

                    if (!textInfos.TryGetValue(text, out textInfo))
                    {
                        textInfo = new XPathInfo();

                        textInfo.XPath = xPathInfo.XPath;

                        textInfos.Add(text, textInfo);
                    }

                    textInfo.Count++;
                }
            }

            //Max throws on an empty sequence; without any text there is nothing to show...
            if (textInfos.Count == 0)
            {
                return;
            }

            //texts that contain, or are contained in, other texts are ranked higher; every ordered pair is visited
            //(including a text with itself), which raises all counts consistently...
            List <string> texts = textInfos.Keys.ToList();

            foreach (string key in texts)
            {
                foreach (string key2 in texts)
                {
                    if (key.Contains(key2) || key2.Contains(key))
                    {
                        textInfos[key].Count++;
                        textInfos[key2].Count++;
                    }
                }
            }

            int maximumCount = textInfos.Values.Max(textInfo => textInfo.Count);

            StringBuilder stringBuilder = new StringBuilder();

            foreach (XPathInfo textInfo in textInfos.Values)
            {
                if (textInfo.Count != maximumCount)
                {
                    continue;
                }

                var htmlNodes = documentNode.SelectNodes(textInfo.XPath);

                if (htmlNodes == null)
                {
                    continue;
                }

                foreach (HtmlNode htmlNode in htmlNodes)
                {
                    stringBuilder.Append(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);
                }
            }

            MessageBox.Show(stringBuilder.ToString());

            //return stringBuilder.ToString();
        }