/// <summary>
/// Loads an HTML file with HtmlAgilityPack and collects its wiki anchors.
/// </summary>
/// <param name="filesource">
/// Location of the HTML file; it is appended to "file://" to build the URL that is loaded.
/// </param>
/// <returns>
/// One <see cref="LinkItem"/> (title, href) per anchor whose href starts with "wiki/" followed by
/// at least one word character, skipping items the list already contains (as decided by
/// <c>List.Contains</c>). The list is empty when nothing matches or when the file cannot be loaded.
/// </returns>
public List <LinkItem> FindHtmlAgility(string filesource)
{
    List<LinkItem> list = new List<LinkItem>();
    HtmlAgilityPack.HtmlWeb htmlweb = new HtmlWeb();

    try
    {
        HtmlDocument doc = htmlweb.Load("file://" + filesource);

        // SelectNodes yields null (not an empty collection) when the XPath matches nothing.
        var anchors = doc.DocumentNode.SelectNodes("//a");
        if (anchors == null)
        {
            return list;
        }

        foreach (var anchor in anchors)
        {
            var hrefAttribute = anchor.Attributes["href"];
            if (hrefAttribute == null || String.IsNullOrEmpty(hrefAttribute.Value))
            {
                continue;
            }

            string href = hrefAttribute.Value;

            // The prefix is a regular expression, so it has to go through Regex;
            // String.StartsWith would compare against the pattern text literally.
            if (!Regex.IsMatch(href, @"^wiki/[\w]+"))
            {
                continue;
            }

            // Anchors without a title attribute get an empty title instead of aborting the scan.
            var titleAttribute = anchor.Attributes["title"];
            string title = titleAttribute != null ? titleAttribute.Value : String.Empty;

            LinkItem link = new LinkItem(title, href);
            if (!list.Contains(link))
            {
                list.Add(link);
            }
        }
    }
    catch (Exception ex)
    {
        // Best effort: an unreadable or malformed file yields whatever was collected so far.
        System.Diagnostics.Debug.WriteLine("FindHtmlAgility: could not process '" + filesource + "': " + ex.Message);
    }

    return list;
}
/// <summary>
/// Builds the title-to-link result set for a query: the most valuable hit first, then every
/// best match, then the links related to the most valuable hit.
/// </summary>
/// <param name="queryParts">Query terms, forwarded unchanged to <c>findBestHitPage</c>.</param>
/// <param name="Did_you_mean">Flag produced by <c>findBestHitPage</c> and passed through to the caller.</param>
/// <returns>
/// A dictionary keyed by title with the link as value. When the same title is offered more
/// than once, the first link added wins.
/// </returns>
public Dictionary <String, String> retrieveResults(List <String> queryParts, out Boolean Did_you_mean)
{
    Dictionary<String, String> ResultSet = new Dictionary<string, string>();

    LinkItem mostValuable;
    Dictionary<String, titleHits> BestMatches = findBestHitPage(queryParts, out mostValuable, out Did_you_mean);

    if (!String.IsNullOrEmpty(mostValuable.Href))
    {
        AddResultIfAbsent(ResultSet, mostValuable.Text, mostValuable.Href);
    }
    else
    {
        // No dedicated top hit: promote the first best match so the related-link
        // lookup below has a title to work with. It is added to ResultSet by the next loop.
        foreach (var match in BestMatches)
        {
            mostValuable.Text = match.Key;
            mostValuable.Href = match.Value.Link;
            break;
        }
    }

    foreach (var match in BestMatches)
    {
        if (match.Value != null)
        {
            AddResultIfAbsent(ResultSet, match.Key, match.Value.Link);
        }
    }

    // Related links are looked up even when there were no best matches at all.
    Dictionary<string, string> relatedLinks = findRelatedLinks2(mostValuable.Text);
    if (relatedLinks != null)
    {
        foreach (var pair in relatedLinks)
        {
            AddResultIfAbsent(ResultSet, pair.Key, pair.Value);
        }
    }

    return ResultSet;
}

/// <summary>
/// Adds (title, link) to the results unless the title is null or already present.
/// </summary>
private static void AddResultIfAbsent(Dictionary<String, String> results, String title, String link)
{
    if (title != null && !results.ContainsKey(title))
    {
        results.Add(title, link);
    }
}
/// <summary>
/// Extracts Wikipedia article links from HTML text using regular expressions.
/// </summary>
/// <param name="fileSource">
/// The HTML text to scan. Despite the name, the string itself is searched; no file is opened.
/// </param>
/// <returns>
/// One <see cref="LinkItem"/> per anchor whose href starts with "/wiki", with
/// "http://en.wikipedia.org" prepended to the href and the anchor's title attribute as text
/// (empty when the anchor has no title). Anchors are skipped when the href starts with "#",
/// contains "(" or "%", or contains a capitalised word followed by ":" (for example "File:").
/// A null or empty input yields an empty list.
/// </returns>
public List <LinkItem> Find(string fileSource)
{
    List<LinkItem> list = new List<LinkItem>();
    if (String.IsNullOrEmpty(fileSource))
    {
        return list;
    }

    // 1. Find all <a ...>...</a> elements in the text.
    //    The \b stops <abbr>, <article>, <aside>, ... from being mistaken for anchors.
    MatchCollection aTag = Regex.Matches(fileSource, @"(<a\b.*?>.*?</a>)", RegexOptions.Singleline);

    // 2. Loop over each match.
    foreach (Match m in aTag)
    {
        String value = m.Groups[1].Value;

        // 3. Get the href attribute; anchors without one are ignored.
        Match m2 = Regex.Match(value, @"href=\""(.*?)\""", RegexOptions.Singleline);
        if (!m2.Success)
        {
            continue;
        }

        String link = m2.Groups[1].Value;

        // Skip in-page fragments, links with parentheses or percent-escapes,
        // and links carrying a "Word:" prefix.
        if (link.StartsWith("#", StringComparison.Ordinal)
            || link.Contains("(")
            || link.Contains("%")
            || Regex.IsMatch(link, "[A-Z][a-z]*:"))
        {
            continue;
        }

        if (!link.StartsWith("/wiki", StringComparison.Ordinal))
        {
            continue;
        }

        LinkItem item = new LinkItem("", "");
        item.Href = "http://en.wikipedia.org" + link;

        // 4. Extract the link title, when present.
        Match m3 = Regex.Match(value, @"title=\""(.*?)\""", RegexOptions.Singleline);
        if (m3.Success)
        {
            item.Text = m3.Groups[1].Value;
        }

        list.Add(item);
    }

    return list;
}
/// <summary>
/// Scans <c>LR.LoadedResultsSet1</c> for titles that satisfy every query term and grades them
/// with <c>CompareTitleSuccess</c>.
/// </summary>
/// <param name="queryTerms">
/// Plain-text query terms. Each term is matched case-insensitively as a literal substring of
/// the title; regex metacharacters in a term are escaped.
/// </param>
/// <param name="mostValuable">
/// Always a new <see cref="LinkItem"/>. It receives the first title graded
/// <c>SearchQuality.BEST</c>; if there is none, and its Href and Text are still empty, it
/// receives the last similar match that was recorded (when any).
/// </param>
/// <param name="Did_you_mean">
/// True only when <paramref name="mostValuable"/> was filled from a similar match rather than
/// from a BEST title.
/// </param>
/// <returns>
/// Titles graded BETTER, with the titles graded GOOD added after them, each mapped to a new
/// <c>titleHits</c> built from the loaded entry.
/// </returns>
public Dictionary <String, titleHits> findBestHitPage(List <String> queryTerms, out LinkItem mostValuable, out Boolean Did_you_mean)
{
    Dictionary<String, titleHits> exactWordsMatch = new Dictionary<string, titleHits>();
    Dictionary<String, titleHits> DerivedWordmatch = new Dictionary<String, titleHits>();
    mostValuable = new LinkItem();
    LinkItem SimilarMatch = null;
    Boolean impDone = false;
    Boolean SimDone = false;
    // A term only counts as a similar match when CompareSimilarMatch returns a value below
    // this threshold; the threshold drops to each accepted value, across all results.
    float benchmark = 0.45F;
    Did_you_mean = false;

    // Escape each term once so that characters such as '+', '(' or '[' are matched literally
    // instead of being interpreted as (possibly invalid) regex syntax.
    List<String> termPatterns = new List<String>(queryTerms.Count);
    foreach (var queryterm in queryTerms)
    {
        termPatterns.Add(Regex.Escape(queryterm));
    }

    foreach (var result in LR.LoadedResultsSet1)
    {
        int success = 0;
        for (int i = 0; i < queryTerms.Count; i++)
        {
            if (Regex.IsMatch(result.Key, termPatterns[i], RegexOptions.IgnoreCase))
            {
                success++;
                continue;
            }

            float value = CompareSimilarMatch(queryTerms[i], result.Key);
            if (value < benchmark)
            {
                benchmark = value;
                SimilarMatch = new LinkItem();
                SimilarMatch.Href = result.Value.Link;
                SimilarMatch.Text = result.Key;
                SimDone = true;
                success++;
            }
        }

        // Only titles for which every term matched (exactly or by similarity) are graded.
        if (success != queryTerms.Count)
        {
            continue;
        }

        SearchQuality decision = CompareTitleSuccess(queryTerms, result.Key);
        if (decision == SearchQuality.BETTER)
        {
            exactWordsMatch.Add(result.Key, new titleHits(result.Value));
        }
        else if (decision == SearchQuality.GOOD)
        {
            DerivedWordmatch.Add(result.Key, new titleHits(result.Value));
        }
        else if (decision == SearchQuality.BEST && !impDone)
        {
            // Only the first BEST title is kept.
            mostValuable.Text = result.Key;
            mostValuable.Href = result.Value.Link;
            impDone = true;
        }
    }

    // GOOD matches are appended to the BETTER ones in the returned dictionary.
    foreach (var match in DerivedWordmatch)
    {
        exactWordsMatch.Add(match.Key, new titleHits(match.Value));
    }

    bool hrefMissing = String.IsNullOrEmpty(mostValuable.Href);
    bool textMissing = String.IsNullOrEmpty(mostValuable.Text);
    if (hrefMissing && textMissing && SimDone && !impDone)
    {
        // No BEST title: fall back to the recorded similar match and tell the caller it is a suggestion.
        mostValuable.Href = SimilarMatch.Href;
        mostValuable.Text = SimilarMatch.Text;
        Did_you_mean = true;
    }

    return exactWordsMatch;
}