/// <summary>
/// Extracts the text between <paramref name="startString"/> and <paramref name="endString"/>
/// within <paramref name="input"/>. A '|' in either marker acts as a quote placeholder and is
/// tried as empty, single quote, then double quote (to match either HTML attribute quoting style).
/// Matching is case-insensitive.
/// </summary>
/// <param name="input">The raw HTML/text to search.</param>
/// <param name="startString">Start marker; '|' is a quote placeholder.</param>
/// <param name="endString">End marker; '|' is a quote placeholder.</param>
/// <returns>
/// "plainText|textWithHyperlinks" for the matched span, or null when the markers are not found
/// or the span is empty.
/// </returns>
public static string ExtractInnerText(string input, string startString, string endString)
{
    // Case-insensitive search: lower-case the haystack and BOTH needles.
    string inputToLowerInvariant = input.ToLowerInvariant();
    startString = startString.ToLowerInvariant();
    // BUGFIX: endString was previously compared against the lowered input without being
    // lowered itself, so any end marker containing upper-case characters never matched.
    endString = endString.ToLowerInvariant();

    int rawIndex = inputToLowerInvariant.IndexOf(startString.Replace("|", ""));
    if (rawIndex == -1)
    {
        rawIndex = inputToLowerInvariant.IndexOf(startString.Replace("|", "'"));
    }
    if (rawIndex == -1)
    {
        rawIndex = inputToLowerInvariant.IndexOf(startString.Replace("|", "\""));
    }

    if (rawIndex != -1)
    {
        // BUGFIX: the two quote-fallback searches below previously omitted the rawIndex start
        // offset, so they could locate an end marker BEFORE the start marker and yield a
        // negative substring length (ArgumentOutOfRangeException) or the wrong span.
        int rawIndexEnd = inputToLowerInvariant.IndexOf(endString.Replace("|", ""), rawIndex);
        if (rawIndexEnd == -1)
        {
            rawIndexEnd = inputToLowerInvariant.IndexOf(endString.Replace("|", "'"), rawIndex);
        }
        if (rawIndexEnd == -1)
        {
            rawIndexEnd = inputToLowerInvariant.IndexOf(endString.Replace("|", "\""), rawIndex);
        }

        if (rawIndexEnd != -1 && rawIndexEnd - rawIndex >= 1)
        {
            // Take the span from the ORIGINAL (case-preserved) input.
            string raw = input.Substring(rawIndex, rawIndexEnd - rawIndex);
            string rawWithHyperLinks = EncapsulateHyperLinks(raw);

            // The angle-bracket wrap/strip dance matches ExtractText's expected input contract.
            raw = UserDefinedFunctions.ExtractText("<" + raw + ">").Value.Trim().TrimStart('<').TrimEnd('>').Trim().TrimStart('<').TrimEnd('>');
            rawWithHyperLinks = UserDefinedFunctions.ExtractText("<" + rawWithHyperLinks + ">").Value.Trim().TrimStart('<').TrimEnd('>').Trim().TrimStart('<').TrimEnd('>');

            return raw + "|" + rawWithHyperLinks;
        }
    }

    return null;
}
/// <summary>
/// Builds a "..."-suffixed summary snippet for the document stored at
/// <paramref name="discoveryPath"/>, using Lucene's highlighter to pick the single best
/// 150-character fragment matching <paramref name="query"/>.
/// </summary>
/// <param name="query">The query whose terms drive fragment scoring.</param>
/// <param name="shouldDocumentsBeClustered">Unused by this overload.</param>
/// <param name="discoveryPath">Path of the downloaded document on disk.</param>
/// <param name="encoding">Encoding used to read the file.</param>
/// <param name="cache">Unused by this overload.</param>
/// <returns>The best fragment followed by " ...", with leading spaces/commas trimmed.</returns>
public static string Summarize(Query query, bool shouldDocumentsBeClustered, string discoveryPath, Encoding encoding, Cache cache)
{
    Highlighter highlighter = new Highlighter(new QueryScorer(query));
    highlighter.SetTextFragmenter(new SimpleFragmenter(150));

    // Strip markup from the stored page source before tokenizing.
    string text = UserDefinedFunctions.ExtractText(File.ReadAllText(discoveryPath, encoding)).Value;

    StandardAnalyzer analyzer = new StandardAnalyzer();
    TokenStream tokenStream = analyzer.TokenStream("text", new StringReader(text));

    string fragment = highlighter.GetBestFragments(tokenStream, text, 1, "...") + " ...";
    return fragment.TrimStart(' ', ',');
}
/// <summary>
///     Performs the action: classifies the decoded HTML of a crawled web page with two
///     Bayesian classifiers (plus two "false positive" counter-classifiers) and derives a
///     class byte. NOTE(review): the computed @class value is never stored or returned —
///     this method currently has no observable effect; confirm whether persistence of the
///     classification was removed intentionally.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    // Skip requests we must not process or that failed at the HTTP layer.
    if (crawlRequest.IsDisallowed || !crawlRequest.ProcessData || crawlRequest.WebClient.WebException != null)
    {
        return;
    }

    if (crawlRequest.DataType.DiscoveryType == DiscoveryType.WebPage)
    {
        if (crawlRequest.Data != null)
        {
            //uncomment to see what was analyzed...
            // NOTE(review): 'text' is computed but unused — it exists only as a debugging aid
            // per the comment above.
            string text = UserDefinedFunctions.ExtractText(crawlRequest.DecodedHtml).Value;

            // Score the page against both primary classifiers and pick the winning class.
            double class1Classification = _class1BayesianClassifier.Classify(crawlRequest.DecodedHtml);
            double class2Classification = _class2BayesianClassifier.Classify(crawlRequest.DecodedHtml);

            byte @class = DetermineClass(class1Classification, class2Classification, true);

            // The "false positive" classifiers veto a classification when they score at least
            // as high as the primary classifier for the chosen class.
            double class1ClassificationFalsePositive = _class1BayesianClassifierFalsePositive.Classify(crawlRequest.DecodedHtml);
            double class2ClassificationFalsePositive = _class2BayesianClassifierFalsePositive.Classify(crawlRequest.DecodedHtml);

            switch (@class)
            {
                case 1:
                    if (class1ClassificationFalsePositive >= class1Classification)
                    {
                        @class = 0;
                    }
                    else
                    {
                    }
                    break;
                case 2:
                    if (class2ClassificationFalsePositive >= class2Classification)
                    {
                        @class = 0;
                    }
                    else
                    {
                    }
                    break;
            }
        }
    }
}
/// <summary>
/// Recursively walks the element tree rooted at <paramref name="htmlNode"/>, building a
/// slash-delimited tag path for each element. For allowed tags whose extracted inner text is
/// non-empty, an XPathInfo entry is recorded; every visit to a recorded path increments its count.
/// </summary>
/// <param name="htmlNode">Current node of the HtmlAgilityPack document.</param>
/// <param name="xpath">Accumulated tag path of the node's ancestors.</param>
/// <param name="xpathInfos">Accumulator mapping tag paths to their XPathInfo.</param>
/// <returns>The same <paramref name="xpathInfos"/> dictionary, for chaining.</returns>
private static IDictionary <string, XPathInfo> GenerateXPaths(HtmlNode htmlNode, string xpath, IDictionary <string, XPathInfo> xpathInfos)
{
    if (htmlNode.NodeType == HtmlNodeType.Element)
    {
        xpath += "/" + htmlNode.Name;

        if (_allowedTagNames.Contains(htmlNode.Name) && !xpathInfos.ContainsKey(xpath))
        {
            string innerText = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

            // Only record paths that carry visible text.
            if (!string.IsNullOrEmpty(innerText.Trim()))
            {
                XPathInfo info = new XPathInfo();
                info.InnerText = innerText;
                info.Tag = htmlNode.Name;
                info.XPath = xpath;

                xpathInfos.Add(xpath, info);
            }
        }

        // Count every occurrence of an already-recorded path (repeated sibling structures).
        if (xpathInfos.ContainsKey(xpath))
        {
            xpathInfos[xpath].Count++;
        }

        Debug.Print(xpath);
    }

    // Recurse into child elements only; text/comment nodes are skipped.
    foreach (HtmlNode child in htmlNode.ChildNodes)
    {
        if (child.NodeType == HtmlNodeType.Element)
        {
            GenerateXPaths(child, xpath, xpathInfos);
        }
    }

    return xpathInfos;
}
/// <summary>
/// Evaluates the XPath in tbXPath against the loaded document and shows the extracted text
/// of the first matching node in the Evaluate dialog.
/// </summary>
/// <param name="sender">The button raising the event.</param>
/// <param name="e">Event arguments (unused).</param>
private void btnEvaluateXPath_Click(object sender, EventArgs e)
{
    try
    {
        // BUGFIX: SelectSingleNode returns null when the XPath matches no nodes; previously
        // that surfaced as a NullReferenceException message. Report it meaningfully instead.
        HtmlNode htmlNode = _htmlDocument.DocumentNode.SelectSingleNode(tbXPath.Text);

        if (htmlNode == null)
        {
            MessageBox.Show("The XPath expression did not match any nodes.", _formText);
            return;
        }

        Evaluate evaluate = new Evaluate();
        evaluate.dataGridView1.Rows.Clear();

        //foreach (HtmlNode htmlNode in _htmlDocument.DocumentNode.SelectSingleNode(tbXPath.Text))
        //{
        //    evaluate.dataGridView1.Rows.Add(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);
        //}

        evaluate.dataGridView1.Rows.Add(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);

        evaluate.ShowDialog(this);
    }
    catch (Exception exception)
    {
        // Invalid XPath expressions (and anything else) are reported to the user.
        MessageBox.Show(exception.Message + Environment.NewLine + exception.StackTrace, _formText);
    }
}
/// <summary>
/// When an element is clicked in the embedded browser, locates the corresponding
/// HtmlAgilityPack node via the injected "arachnode_scraper_id" attribute (matched against the
/// active element's TabIndex), selects its tree node, and shows its XPath and extracted text.
/// </summary>
/// <param name="sender">The clicked HTML element.</param>
/// <param name="e">Event arguments (unused).</param>
void htmlElement_Click(object sender, HtmlElementEventArgs e)
{
    if (wbBrowser.Document.ActiveElement != null)
    {
        foreach (HtmlNode htmlNode in _htmlDocument.DocumentNode.DescendantsAndSelf())
        {
            if (htmlNode.GetAttributeValue("arachnode_scraper_id", string.Empty) == wbBrowser.Document.ActiveElement.TabIndex.ToString())
            {
                // BUGFIX: First() threw InvalidOperationException when no tree node was tagged
                // with the active element; use FirstOrDefault and guard.
                TreeNode treeNode = _treeNodes.Where(tn => (HtmlElement)tn.Tag == wbBrowser.Document.ActiveElement).FirstOrDefault();

                if (treeNode != null)
                {
                    tvBrowser.SelectedNode = treeNode;
                    tvBrowser.HideSelection = false;
                }

                tbXPath.Text = htmlNode.XPath;

                tbResult.Text = null;
                if (!string.IsNullOrEmpty(wbBrowser.Document.ActiveElement.InnerHtml))
                {
                    tbResult.Text = UserDefinedFunctions.ExtractText(wbBrowser.Document.ActiveElement.InnerHtml).Value;
                }

                // BUGFIX: the scraper id is unique per element — stop scanning once handled
                // instead of walking the remainder of the document.
                break;
            }
        }
    }
}
/// <summary>
///     Manages the web page: optionally decodes the raw bytes, extracts meta data
///     (tags and text), inserts the meta data via the DAO, and/or saves the page source to disk.
/// </summary>
/// <param name = "webPageID">The web page ID.</param>
/// <param name = "absoluteUri">The absolute URI.</param>
/// <param name = "source">The source.</param>
/// <param name = "encoding">The encoding.</param>
/// <param name = "fullTextIndexType">Full type of the text index.</param>
/// <param name = "extractWebPageMetaData">if set to <c>true</c> [extract web page meta data].</param>
/// <param name = "insertWebPageMetaData">if set to <c>true</c> [insert web page meta data].</param>
/// <param name = "saveWebPageToDisk">if set to <c>true</c> [save web page to disk].</param>
/// <returns>The populated ManagedWebPage, or null when an exception occurred.</returns>
public override ManagedWebPage ManageWebPage(long webPageID, string absoluteUri, byte[] source, Encoding encoding, string fullTextIndexType, bool extractWebPageMetaData, bool insertWebPageMetaData, bool saveWebPageToDisk)
{
    try
    {
        ManagedWebPage managedWebPage = new ManagedWebPage();

        string source2 = null;

        // Decode the raw bytes only when one of the consumers below needs the string form.
        if (extractWebPageMetaData || saveWebPageToDisk)
        {
            source2 = encoding.GetString(source);
        }

        if (extractWebPageMetaData)
        {
            // Tags/text are extracted from the HTML-decoded form; the DOM is built from the
            // encoded form.
            string source3 = HttpUtility.HtmlDecode(source2);

            //ANODET: Enable the HtmlAgilityPack to work with bytes.
            managedWebPage.HtmlDocument = _htmlManager.CreateHtmlDocument(source2, Encoding.Unicode);
            managedWebPage.Tags = UserDefinedFunctions.ExtractTags(source3).Value;
            managedWebPage.Text = UserDefinedFunctions.ExtractText(source3).Value;

            #region Experimental Code comparing character parsing vs. regular expressions...

            //bool inATag = false;

            //StringBuilder stringBuilder = new StringBuilder();

            //for (int i = 0; i < source3.Length; i++)
            //{
            //    if(source3[i] == '<')
            //    {
            //        inATag = true;
            //        continue;
            //    }

            //    if (source3[i] == '>')
            //    {
            //        inATag = false;
            //        continue;
            //    }

            //    if (!inATag && !char.IsControl(source3[i]))
            //    {
            //        stringBuilder.Append(source3[i]);
            //    }
            //}

            //managedWebPage.Text = stringBuilder.ToString();

            #endregion

            if (insertWebPageMetaData)
            {
                _arachnodeDAO.InsertWebPageMetaData(webPageID, absoluteUri, encoding.GetBytes(managedWebPage.Text), managedWebPage.HtmlDocument.DocumentNode.OuterHtml);
            }
        }

        if (saveWebPageToDisk)
        {
            managedWebPage.DiscoveryPath = _discoveryManager.GetDiscoveryPath(ApplicationSettings.DownloadedWebPagesDirectory, absoluteUri, fullTextIndexType);

            // NOTE(review): the StreamWriter is stored on the ManagedWebPage and is NOT disposed
            // here — presumably the caller owns and closes it; confirm, otherwise the file
            // handle leaks.
            managedWebPage.StreamWriter = new StreamWriter(managedWebPage.DiscoveryPath, false, encoding);
            managedWebPage.StreamWriter.Write(source2);
        }

        return(managedWebPage);
    }
    catch (Exception exception)
    {
        //ANODET: Long paths...
#if !DEMO
        _arachnodeDAO.InsertException(absoluteUri, null, exception, false);
#endif
        // NOTE(review): in DEMO builds the exception is silently swallowed.
    }

    return(null);
}
/// <summary>
/// Generates a highlighted summary fragment for <paramref name="query"/> against raw text,
/// with progressively weaker fallbacks: (1) Lucene highlighter on the markup-stripped text,
/// (2) highlighter on the HTML-encoded original text, (3) manual bold-wrapping of the first
/// extractable query term found in the text.
/// </summary>
/// <param name="query">The primary query whose terms drive highlighting.</param>
/// <param name="wildcardSafeQuery">Fallback query used when the primary query's terms cannot be extracted (e.g. wildcards).</param>
/// <param name="shouldDocumentsBeClustered">Unused by this overload.</param>
/// <param name="text">The raw document text.</param>
/// <returns>The best fragment suffixed with "...", with leading spaces/commas trimmed.</returns>
public static string Summarize(Query query, Query wildcardSafeQuery, bool shouldDocumentsBeClustered, string text)
{
    int fragmentLength = 150;

    StandardAnalyzer standardAnalyzer = new StandardAnalyzer();

    Highlighter highligher = new Highlighter(new QueryScorer(query));
    highligher.SetTextFragmenter(new SimpleFragmenter(fragmentLength));

    string text2 = UserDefinedFunctions.ExtractText(text).Value;
    TokenStream tokenStream = standardAnalyzer.TokenStream("text", new StringReader(text2));

    string bestFragments = (highligher.GetBestFragments(tokenStream, text2, 1, "...") + " ...").TrimStart(" ,".ToCharArray());

    if (bestFragments == "...")
    {
        // Fallback 1: highlight against the HTML-encoded original text.
        text = HttpUtility.HtmlEncode(text);

        tokenStream = standardAnalyzer.TokenStream("text", new StringReader(text));

        bestFragments = (highligher.GetBestFragments(tokenStream, text, 1, "...") + " ...").TrimStart(" ,".ToCharArray());

        if (bestFragments == "...")
        {
            // Fallback 2: locate the first query term ourselves and bold it manually.
            Hashtable hashTable = new Hashtable();

            try
            {
                query.ExtractTerms(hashTable);
            }
            catch
            {
                // Some query types (e.g. wildcards) cannot be extracted; best-effort with the
                // wildcard-safe variant, and give up silently if that fails too.
                try
                {
                    wildcardSafeQuery.ExtractTerms(hashTable);
                }
                catch
                {
                }
            }

            if (hashTable.Count != 0)
            {
                string firstTerm = null;

                // Use the first term targeting the "text" field.
                foreach (Term term in hashTable.Values)
                {
                    if (term.Field() == "text")
                    {
                        string termText = term.Text();

                        if (termText != null)
                        {
                            firstTerm = termText.Split(' ')[0];
                            break;
                        }
                    }
                }

                if (firstTerm != null)
                {
                    int index = text.ToLowerInvariant().IndexOf(firstTerm);

                    if (index != -1)
                    {
                        // Clamp the fragment to the end of the text.
                        if (index + fragmentLength > text.Length)
                        {
                            fragmentLength = text.Length - index;
                        }

                        // BUGFIX: escape the term before using it as a regex pattern — terms
                        // containing metacharacters (e.g. "c++", "a.b?") previously threw
                        // ArgumentException in Regex.Replace.
                        bestFragments = Regex.Replace(text.Substring(index, fragmentLength), Regex.Escape(firstTerm), "<b>" + firstTerm + "</b>", RegexOptions.IgnoreCase) + "...";
                    }
                }
            }
        }
    }

    return(bestFragments);
}
/// <summary>
///     Performs the action: attempts to isolate the "main content" of a crawled web page by
///     generating candidate XPaths, scoring them (Levenshtein-based), selecting the deepest
///     candidates, then cross-scoring their extracted texts by mutual containment and
///     concatenating the text of the highest-scoring XPath.
///     NOTE(review): the result is shown via MessageBox.Show and the return is commented out —
///     this looks like debugging scaffolding left in a crawler action; confirm intent.
/// </summary>
/// <param name = "crawlRequest">The crawl request.</param>
/// <param name = "arachnodeDAO">The arachnode DAO.</param>
public override void PerformAction(CrawlRequest <TArachnodeDAO> crawlRequest, IArachnodeDAO arachnodeDAO)
{
    //use this instead: http://code.google.com/p/boilerpipe/
    /*if (!crawlRequest.ProcessData)
     * {
     *     return;
     * }*/

    if (crawlRequest.DataType.DiscoveryType == DiscoveryType.WebPage)
    {
        if (crawlRequest.Data != null)
        {
            ManagedWebPage managedWebPage = ((ManagedWebPage)crawlRequest.ManagedDiscovery);

            // Lazily build the DOM if an earlier action hasn't already done so.
            if (managedWebPage.HtmlDocument == null)
            {
                managedWebPage.HtmlDocument = crawlRequest.Crawl.Crawler.HtmlManager.CreateHtmlDocument(crawlRequest.Html, Encoding.Unicode);
            }

            // Collect candidate XPaths (tag paths with non-empty text) for the whole document.
            IDictionary <string, XPathInfo> xPathInfos = new Dictionary <string, XPathInfo>();

            xPathInfos = GenerateXPaths(managedWebPage.HtmlDocument.DocumentNode, string.Empty, xPathInfos);

            //string dateXPath = ExtractDateXPath(htmlDocument1, xpathInfos);
            //List<string> dates = htmlDocument1.DocumentNode.SelectNodes(dateXPath).OfType<HtmlNode>().Select(h => h.InnerText).ToList();

            ProcessXPaths(xPathInfos);

            // Rank candidates by descending Levenshtein distance score assigned in ProcessXPaths.
            List <XPathInfo> xPathInfos2 = xPathInfos.Values.OrderByDescending(x => x.LevenstheinDistance).ToList();

            int numberOfSlashes = 0;

            IDictionary <string, XPathInfo> xPathInfos3 = new Dictionary <string, XPathInfo>();

            int xPaths = 0;
            int minimumNumberOfXPaths = 5;

            // Keep candidates that are strictly deeper than anything seen so far; allow up to
            // minimumNumberOfXPaths non-deepening candidates before stopping.
            foreach (XPathInfo xPathInfo in xPathInfos2)
            {
                // Slash count == path depth.
                int numberOfSlashes2 = xPathInfo.XPath.Length - xPathInfo.XPath.Replace("/", string.Empty).Length;

                if (numberOfSlashes2 > numberOfSlashes)
                {
                    numberOfSlashes = numberOfSlashes2;

                    xPathInfos3.Add(xPathInfo.XPath, xPathInfo);
                }
                else
                {
                    if (xPaths++ > minimumNumberOfXPaths)
                    {
                        break;
                    }
                }
            }

            StringBuilder stringBuilder = new StringBuilder();

            // Map extracted text -> XPathInfo (first XPath that produced it), counting duplicates.
            Dictionary <string, XPathInfo> dictionary = new Dictionary <string, XPathInfo>();

            foreach (XPathInfo xPathInfo in xPathInfos3.Values)
            {
                //stringBuilder.Remove(0, stringBuilder.Length);

                // NOTE(review): SelectNodes returns null when the XPath matches nothing, which
                // would throw in this foreach — presumably every stored XPath still matches the
                // document it was generated from; confirm.
                foreach (HtmlNode htmlNode in managedWebPage.HtmlDocument.DocumentNode.SelectNodes(xPathInfo.XPath))
                {
                    string text = UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value;

                    if (!dictionary.ContainsKey(text))
                    {
                        XPathInfo xPathInfo2 = new XPathInfo();

                        xPathInfo2.XPath = xPathInfo.XPath;

                        dictionary.Add(text, xPathInfo2);
                    }

                    dictionary[text].Count++;
                }
            }

            // Second map restricted to non-blank texts; scored below.
            Dictionary <string, XPathInfo> dictionary2 = new Dictionary <string, XPathInfo>();

            foreach (KeyValuePair <string, XPathInfo> keyValuePair in dictionary)
            {
                if (!string.IsNullOrEmpty(keyValuePair.Key.Trim()))
                {
                    dictionary2.Add(keyValuePair.Key, keyValuePair.Value);
                }
            }

            // Cross-score every pair of texts by substring containment.
            // NOTE(review): the pair where key == key2 always matches, so every entry gets a
            // uniform +2 inflation — harmless for the max comparison below, but intentional?
            foreach (string key in dictionary.Keys)
            {
                foreach (string key2 in dictionary.Keys)
                {
                    if (!string.IsNullOrEmpty(key.Trim()) && !string.IsNullOrEmpty(key2.Trim()))
                    {
                        if (key.Contains(key2) || key2.Contains(key))
                        {
                            dictionary2[key].Count++;
                            dictionary2[key2].Count++;
                        }
                    }
                }
            }

            // NOTE(review): Max() throws InvalidOperationException when dictionary2 is empty
            // (e.g. a page whose candidates are all blank); confirm upstream guarantees.
            int dictionary2Max = dictionary2.Max(d => d.Value.Count);

            // Concatenate the extracted text of every XPath tied for the highest score.
            foreach (KeyValuePair <string, XPathInfo> keyValuePair in dictionary2)
            {
                if (keyValuePair.Value.Count == dictionary2Max)
                {
                    foreach (HtmlNode htmlNode in managedWebPage.HtmlDocument.DocumentNode.SelectNodes(keyValuePair.Value.XPath))
                    {
                        stringBuilder.Append(UserDefinedFunctions.ExtractText(htmlNode.InnerHtml).Value);
                    }
                }
            }

            MessageBox.Show(stringBuilder.ToString());

            //return stringBuilder.ToString();
        }
    }
}