/// <summary>
/// Searches the links of the current document for the first one whose
/// <c>href</c> attribute contains the suggested page name.
/// </summary>
/// <param name="pageName">
/// Page name to be searched. Matched as a case-insensitive ordinal substring of each link's <c>href</c>.
/// </param>
/// <returns>
/// Uri built from the first matching <c>href</c> after it has been passed through <c>FixUrl(_uri)</c>;
/// null when no link matches.
/// </returns>
public Uri FindPage(string pageName)
{
    HtmlProcessor proc = new HtmlProcessor(this.DocumentHtml);

    foreach (var item in proc.Links)
    {
        // Read the attribute once; it is needed for both the match and the result.
        var href = item.Attributes["href"];

        // The href lookup can yield null (other code in this file null-checks it),
        // so links without an href are skipped instead of being dereferenced.
        if (href == null)
        {
            continue;
        }

        if (href.IndexOf(pageName, StringComparison.OrdinalIgnoreCase) != -1)
        {
            return new Uri(href.FixUrl(_uri));
        }
    }

    return null;
}
/// <summary>
/// Extracts links from the page at the given uri, dropping links whose <c>href</c>
/// contains one of the fragments excluded for the configured search engine and
/// links whose <c>href</c> is not a well-formed absolute uri.
/// </summary>
/// <param name="uri">Uri to be processed.</param>
/// <returns>
/// Extracted links. The list is empty when the configured engine is not one of
/// Google, Yandex or Bing.
/// </returns>
protected List<DomElement> GetExtractedLinks(Uri uri)
{
    List<DomElement> links = new List<DomElement>();

    // The page is loaded and parsed before the engine is inspected, so the
    // unsupported-engine path behaves the same as before apart from the result.
    UriHtmlExtractor uriProc = new UriHtmlExtractor(uri, proxy: _proxy);
    HtmlProcessor proc = new HtmlProcessor(uriProc.DocumentHtml);

    // Substrings that disqualify a link for the selected engine.
    string[] excludedFragments;
    switch (_engine)
    {
        case SearchEngines.Google:
            excludedFragments = new[] { "google.", "cache:", "related:" };
            break;

        case SearchEngines.Yandex:
            excludedFragments = new[] { "yandex.", "ya.ru", "moikrug" };
            break;

        case SearchEngines.Bing:
            excludedFragments = new[]
            {
                "bingj.",
                "msn.",
                "live.",
                "bing.",
                "google.com",
                "go.microsoft.com",
                "microsofttranslator"
            };
            break;

        default:
            // No filtering rules for this engine: nothing is extracted.
            return links;
    }

    foreach (DomElement link in proc.Links)
    {
        var href = link.Attributes["href"];

        // Links without an href are skipped for every engine. Previously the
        // Google branch dereferenced the attribute without this check.
        if (href == null)
        {
            continue;
        }

        if (excludedFragments.Any(fragment => href.Contains(fragment)))
        {
            continue;
        }

        if (Uri.IsWellFormedUriString(href, UriKind.Absolute))
        {
            links.Add(link);
        }
    }

    return links;
}
/// <summary>
/// Gets extracted data. Resets the html processor, the list of selected data types
/// and the result list, then calls <c>AddCustomRegex</c> once for every selected
/// <c>DataTypes</c> flag and once for every entry of the custom regex collection.
/// </summary>
/// <returns>
/// The result list held in <c>_results</c> (presumably filled by <c>AddCustomRegex</c>,
/// which is not visible here; verify against its implementation).
/// </returns>
public List<ExtractedItemInfo> GetExtractedResults()
{
    _htmlProcessor = new HtmlProcessor(_innerHtml);
    _regexStrings = new List<DataTypes>();
    _results = new List<ExtractedItemInfo>();

    // Having no data type selected is not treated as an error: in that case only
    // the custom regexes below are applied.
    foreach (DataTypes dataType in Enum.GetValues(typeof(DataTypes)))
    {
        // "All" is skipped so that it is never processed as a data type of its own.
        if (dataType == DataTypes.All)
        {
            continue;
        }

        if ((_dataTypesToExtract & dataType) != 0)
        {
            _regexStrings.Add(dataType);
        }
    }

    // Built-in data types: the pattern comes from GetStringValue(), the group name
    // is the enum member name.
    foreach (var dataType in _regexStrings)
    {
        this.AddCustomRegex(dataType.GetStringValue(), dataType.ToString().Trim());
    }

    // Custom regexes: key is the pattern, value is the group name.
    foreach (var customRegex in _customRegexStrings)
    {
        this.AddCustomRegex((string)customRegex.Key, customRegex.Value);
    }

    return _results;
}