/// <summary>
/// Reports whether the supplied HTML contains an <c>embed</c> or <c>object</c> begin tag.
/// </summary>
/// <param name="input">HTML text to scan.</param>
/// <returns><c>true</c> when seeking either begin tag succeeds; otherwise <c>false</c>.</returns>
private static bool ContainsEmbed(string input)
{
    BeginTagPredicate embedTag = new BeginTagPredicate("embed");
    BeginTagPredicate objectTag = new BeginTagPredicate("object");
    IElementPredicate embedOrObject = new OrPredicate(embedTag, objectTag);

    HtmlExtractor extractor = new HtmlExtractor(input);
    return extractor.Seek(embedOrObject).Success;
}
/// <summary>
/// Rebuilds <paramref name="html"/>, substituting each placeholder span (a span whose class is
/// <c>PRESERVE_CLASS</c> and whose id looks like <c>preserve&lt;alphanumeric&gt;</c>) with the
/// value stored for that id in <c>preserved</c>.
/// </summary>
/// <param name="html">HTML that may contain placeholder spans.</param>
/// <returns>
/// The HTML with every recognised placeholder replaced. Spans whose id does not match, or
/// whose id has no stored value, are copied through unchanged.
/// </returns>
public string RestorePreserved(string html)
{
    StringBuilder output = new StringBuilder();
    HtmlExtractor extractor = new HtmlExtractor(html);
    int cursor = 0;

    while (extractor.Seek("<span class='" + PRESERVE_CLASS + "'>").Success)
    {
        // Copy everything between the previous position and the start of this span.
        int spanStart = extractor.Element.Offset;
        output.Append(html, cursor, spanStart - cursor);
        cursor = spanStart;

        BeginTag spanTag = (BeginTag)extractor.Element;
        string spanId = spanTag.GetAttributeValue("id") ?? "";
        Match idMatch = Regex.Match(spanId, @"^preserve([a-zA-Z0-9]+)$");
        if (!idMatch.Success)
        {
            // Not one of our placeholders; it is emitted verbatim by a later Append.
            continue;
        }

        string storedValue;
        if (!preserved.TryGetValue(idMatch.Groups[1].Value, out storedValue))
        {
            continue;
        }

        output.Append(storedValue);

        // Skip the placeholder's own content; if the span is never closed, drop the rest.
        extractor.CollectTextUntil("span");
        cursor = extractor.Element == null ? html.Length : extractor.Parser.Position;
    }

    output.Append(html, cursor, html.Length - cursor);
    return output.ToString();
}
/// <summary>
/// Reads the width and height attributes of the first <c>embed</c> or <c>object</c> begin tag
/// that declares both.
/// </summary>
/// <param name="input">HTML to inspect; may be null or empty.</param>
/// <returns>
/// The size parsed from the tag, or a size built from <c>_width</c> and <c>_height</c> when the
/// input is null/empty, no matching tag is found, or an exception is thrown while parsing.
/// </returns>
Size FindSizeAttribute(string input)
{
    Size result = new Size(_width, _height);
    if (string.IsNullOrEmpty(input))
    {
        return result;
    }

    try
    {
        // Both attributes must be present on the tag for it to qualify.
        RequiredAttribute[] widthAndHeight = new RequiredAttribute[]
        {
            new RequiredAttribute("width"),
            new RequiredAttribute("height")
        };
        IElementPredicate sizedTag = new OrPredicate(
            new BeginTagPredicate("embed", widthAndHeight),
            new BeginTagPredicate("object", widthAndHeight));

        HtmlExtractor extractor = new HtmlExtractor(input);
        if (extractor.Seek(sizedTag).Success)
        {
            BeginTag tag = (BeginTag)extractor.Element;
            int width = Convert.ToInt32(tag.GetAttributeValue("width"), CultureInfo.InvariantCulture);
            int height = Convert.ToInt32(tag.GetAttributeValue("height"), CultureInfo.InvariantCulture);
            result = new Size(width, height);
        }
    }
    catch (Exception error)
    {
        // Non-numeric attribute values (and any other failure) fall back to the default size.
        Trace.Fail("Exception thrown while trying to find video size: " + error);
    }

    return result;
}
/// <summary>
/// Decides whether the homepage belongs to a Blogger blog.
/// </summary>
/// <remarks>
/// The blog is treated as Blogger when any of the following holds:
/// the homepage URL is on a <c>*.blogspot.com</c> host, the homepage URL is on
/// <c>blogger.com</c>, the homepage HTML satisfies <c>BloggerGeneratorCriterion</c>, or the
/// HTML contains a <c>service.post</c> atom link pointing at a blogger/blogspot host.
/// </remarks>
/// <returns><c>true</c> if one of the checks matches; otherwise <c>false</c>.</returns>
public bool IsBlogger()
{
    // Accept both http and https for blogspot hosts, in line with the blogger.com check
    // and the atom-link checks further down, which already accept https.
    if (Regex.IsMatch(homepageUrl, @"^https?://.+\.blogspot\.com($|/)", RegexOptions.IgnoreCase)
        || Regex.IsMatch(homepageUrl, @"^http(s)?://(www\.)?blogger\.com($|/)", RegexOptions.IgnoreCase)
        || new HtmlExtractor(html).Seek(new BloggerGeneratorCriterion()).Success)
    {
        return true;
    }

    HtmlExtractor ex = new HtmlExtractor(html);
    while (ex.Seek("<link href rel='service.post' type='application/atom+xml'>").Success)
    {
        BeginTag bt = (BeginTag)ex.Element;
        string atomHref = bt.GetAttributeValue("href");
        if (atomHref == null)
        {
            // Defensive: a link without a usable href value cannot identify the host.
            continue;
        }

        // these obsolete Blogger atom links can't be used, but are
        // still a good indication that it's Blogger
        if (atomHref.StartsWith("https://www.blogger.com/atom/", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        // any other blogger or blogspot atom link will be considered a match
        if (Regex.IsMatch(atomHref, @"^https?\:\/\/.+\.blog(ger|spot)\.com\/.*", RegexOptions.IgnoreCase))
        {
            return true;
        }
    }

    return false;
}
/// <summary>
/// Looks through the <c>script</c> tags of a page for one whose <c>src</c> matches the
/// pattern of a known image viewer.
/// </summary>
/// <param name="html">Page HTML to scan.</param>
/// <param name="sourceUrl">URL of the page, used to resolve relative script URLs.</param>
/// <returns>The first viewer whose pattern matches a script URL, or <c>null</c> if none does.</returns>
public static ImageViewer DetectImageViewer(string html, string sourceUrl)
{
    List<ImageViewer> knownViewers = imageViewers;

    // The regexes are only built if at least one script tag with a src is encountered.
    LazyLoader<List<Regex>> lazyPatterns = new LazyLoader<List<Regex>>(delegate
    {
        List<Regex> compiled = new List<Regex>(knownViewers.Count);
        foreach (ImageViewer viewer in knownViewers)
        {
            compiled.Add(new Regex(viewer.Pattern, RegexOptions.CultureInvariant));
        }
        return compiled;
    });

    HtmlExtractor extractor = new HtmlExtractor(html);
    while (extractor.Seek("<script src>").Success)
    {
        string scriptUrl = ((BeginTag)extractor.Element).GetAttributeValue("src");
        if (!String.IsNullOrEmpty(scriptUrl))
        {
            try
            {
                // We need absolute URLs.
                if (!UrlHelper.IsUrl(scriptUrl))
                {
                    scriptUrl = UrlHelper.EscapeRelativeURL(sourceUrl, scriptUrl);
                }

                Uri parsed = new Uri(scriptUrl);
                if (parsed.IsAbsoluteUri)
                {
                    // WinLive 248276: We want just the path portion since there could be an additional
                    // query or fragment on the URL that our regexes can't handle.
                    scriptUrl = parsed.GetLeftPart(UriPartial.Path);
                }
            }
            catch (UriFormatException)
            {
                // We'll just use the regex on the raw attribute value.
            }

            // Patterns and viewers are index-aligned, so a hit at index n maps to viewer n.
            List<Regex> patterns = lazyPatterns.Value;
            for (int n = 0; n < patterns.Count; n++)
            {
                if (patterns[n].IsMatch(scriptUrl))
                {
                    return knownViewers[n];
                }
            }
        }
    }

    return null;
}
/// <summary>
/// Downloads the homepage and applies <paramref name="regex"/> to its text elements.
/// </summary>
/// <param name="regex">Pattern to look for.</param>
/// <returns>
/// The match against the first <c>Text</c> element the pattern matches, or
/// <see cref="Match.Empty"/> when no text element matches.
/// </returns>
public Match MatchHomepageText(Regex regex)
{
    HtmlExtractor extractor = new HtmlExtractor(DownloadHomepage());
    DelegatePredicate matchingText =
        new DelegatePredicate(element => element is Text && regex.IsMatch(element.ToString()));

    return extractor.Seek(matchingText).Success
        ? regex.Match(extractor.Element.ToString())
        : Match.Empty;
}
/// <summary>
/// Reports whether the HTML contains an <c>embed</c>, <c>object</c> or <c>iframe</c> begin tag.
/// </summary>
/// <param name="html">HTML text to scan.</param>
/// <returns>
/// <c>true</c> when one of the tags is found; <c>false</c> when none is found or when
/// scanning throws (failures are deliberately treated as "not found").
/// </returns>
private static bool ContainsEmbedOrObject(string html)
{
    try
    {
        IElementPredicate anyEmbeddingTag = new OrPredicate(
            new BeginTagPredicate("embed"),
            new BeginTagPredicate("object"),
            new BeginTagPredicate("iframe"));

        HtmlExtractor extractor = new HtmlExtractor(html);
        bool found = extractor.Seek(anyEmbeddingTag).Success;
        return found;
    }
    catch
    {
        return false;
    }
}
/// <summary>
/// Loads an extractor template from <paramref name="templateStream"/> and runs it against
/// <paramref name="source"/>.
/// </summary>
/// <param name="templateStream">Stream holding the template passed to <c>HtmlExtractor.Load</c>.</param>
/// <param name="source">Stream holding the document to extract from.</param>
/// <returns>The extraction result, or <c>null</c> when loading or extraction throws.</returns>
private IExtractionResult _Extract(Stream templateStream, Stream source)
{
    IExtractionResult result = null;
    try
    {
        HtmlExtractor template = HtmlExtractor.Load(templateStream);
        result = template.Extract(source);
    }
    catch (Exception e)
    {
        // Print the whole exception rather than only the stack trace: the type and message
        // are what identify the failure, and ToString() includes the stack trace as well.
        Console.WriteLine(e);
    }

    return result;
}
/// <summary>
/// Verifies that <c>ParseFirst</c> with a text-extraction XPath rule returns the text of the
/// matching headline element.
/// </summary>
public void ContentParseTest()
{
    const string expected = "China and India are making the planet greener, NASA says";
    string markup = $"<div><h1 class=\"pg-headline\">{expected}</h1></div>";

    HtmlExtractor extractor = new HtmlExtractor();
    extractor.SetHtml(markup);

    XpathExtractModel headlineRule = new XpathExtractModel()
    {
        ExtractType = ExtractType.Text,
        XpathRule = "//h1[@class='pg-headline']",
        XpathEndAttributes = null
    };
    string actual = extractor.ParseFirst(headlineRule);

    Assert.True(actual == expected);
}
/// <summary>
/// Decides whether content should be refreshed by comparing the <c>title</c> of the first
/// titled <c>img</c> tag in the old and new HTML.
/// </summary>
/// <param name="oldHTML">Previously stored HTML.</param>
/// <param name="newHTML">Candidate replacement HTML.</param>
/// <returns>
/// <c>false</c> only when both documents contain such an image and the titles are equal;
/// <c>true</c> in every other case.
/// </returns>
public bool ShouldUpdateContent(string oldHTML, string newHTML)
{
    HtmlExtractor oldExtractor = new HtmlExtractor(oldHTML);
    HtmlExtractor newExtractor = new HtmlExtractor(newHTML);

    HtmlExtractor oldImage = oldExtractor.Seek("<img title>");
    HtmlExtractor newImage = newExtractor.Seek("<img title>");

    // Without a titled image on both sides there is nothing to compare, so update.
    if (!oldImage.Success || !newImage.Success)
    {
        return true;
    }

    string oldTitle = ((BeginTag)oldImage.Element).GetAttributeValue("title");
    string newTitle = ((BeginTag)newImage.Element).GetAttributeValue("title");
    return oldTitle != newTitle;
}
/// <summary>
/// Builds a <c>BlogPost</c> from an Atom entry element.
/// </summary>
/// <param name="entryNode">The entry element to read.</param>
/// <param name="includeCategories">Whether categories should be copied onto the post.</param>
/// <param name="documentUri">URI of the containing document, passed through to <c>AtomEntry</c>.</param>
/// <returns>
/// The populated post. When extended entries are supported and the content contains an
/// <c>&lt;a name="more"&gt;</c> anchor, the content is split around that anchor (and its
/// closing tag, if present); otherwise the content is assigned as a whole. Content that is
/// only whitespace is not assigned at all.
/// </returns>
public override BlogPost Parse(XmlElement entryNode, bool includeCategories, Uri documentUri)
{
    BlogPost post = new BlogPost();
    AtomEntry entry = new AtomEntry(_atomVer, _atomNS, CategoryScheme, _nsMgr, documentUri, entryNode);

    post.Title = entry.Title;
    post.Excerpt = entry.Excerpt;
    post.Id = PostUriToPostId(entry.EditUri);
    post.Permalink = entry.Permalink;

    string html = entry.ContentHtml;
    if (html.Trim() != string.Empty)
    {
        HtmlExtractor extractor = new HtmlExtractor(html);
        bool hasMoreAnchor = Options.SupportsExtendedEntries && extractor.Seek("<a name=\"more\">").Success;
        if (!hasMoreAnchor)
        {
            post.Contents = html;
        }
        else
        {
            int anchorStart = extractor.Element.Offset;
            int anchorLength = extractor.Element.Length;

            // The extended part begins after the closing </a> when there is one,
            // otherwise directly after the opening anchor tag.
            int extendedStart = extractor.Seek("</a>").Success
                ? extractor.Element.Offset + extractor.Element.Length
                : anchorStart + anchorLength;

            post.SetContents(html.Substring(0, anchorStart), html.Substring(extendedStart));
        }
    }

    post.DatePublished = entry.PublishDate;

    if (Options.SupportsCategories && includeCategories)
    {
        post.Categories = entry.Categories;
    }

    return post;
}
// Warning: Does not deal with escaping properly. This is fine as long as
// we're only using it for content we generate and there are no security
// implications.
/// <summary>
/// Removes every <c>div</c> whose class equals <paramref name="cssClass"/>, together with its
/// content, from <paramref name="html"/>.
/// </summary>
/// <param name="html">HTML to filter.</param>
/// <param name="cssClass">Class value of the divs to strip.</param>
/// <returns>
/// The original string when <paramref name="cssClass"/> does not occur in it at all;
/// otherwise a copy with the matching divs removed.
/// </returns>
public static string StripDivsWithClass(string html, string cssClass)
{
    // Cheap pre-check. Use an ordinal search: the class name is an identifier, and the
    // culture-sensitive default of IndexOf(string) can behave differently across cultures
    // and runtimes.
    if (html.IndexOf(cssClass, System.StringComparison.Ordinal) < 0)
    {
        return html;
    }

    StringBuilder sb = new StringBuilder();
    HtmlExtractor ex = new HtmlExtractor(html);
    int pos = 0;
    while (ex.Seek("<div class='" + cssClass + "'>").Success)
    {
        // Keep the text before the div, then skip past the div's content.
        sb.Append(html, pos, ex.Element.Offset - pos);
        ex.Parser.CollectHtmlUntil("div");
        pos = ex.Parser.Position;
    }

    sb.Append(html, pos, html.Length - pos);
    return sb.ToString();
}
/// <summary>
/// Verifies that <c>ParseList</c> with an anchor XPath rule and the <c>href</c> end attribute
/// yields the link target of the anchor in the markup.
/// </summary>
public void HtmlContentParseTest()
{
    const string expectedUrl = "https://ai.baidu.com";
    string markup = $"<div class=\"ai-nav-menu-item-list\"><a href=\"{expectedUrl}\" target=\"_blank\" class=\"ai-nav-menu-item-list-item\">test001</a></div>";

    HtmlExtractor extractor = new HtmlExtractor();
    extractor.SetHtml(markup);

    XpathExtractModel anchorRule = new XpathExtractModel()
    {
        ExtractType = ExtractType.Text,
        XpathRule = "//a[@href]",
        XpathEndAttributes = new List<string>() { "href" }
    };
    List<string> extracted = extractor.ParseList(anchorRule);

    Assert.True(extracted.Contains(expectedUrl));
}
/// <summary>
/// Extracts an id from an <c>embed</c> tag using the configured embed patterns.
/// </summary>
/// <param name="input">HTML containing the embed markup.</param>
/// <returns>
/// The value of the <c>id</c> group from the first pattern that matches its attribute on an
/// <c>embed</c> tag, or an empty string when no pattern yields an id.
/// </returns>
private string IdFromEmbed(string input)
{
    foreach (EmbedPattern pattern in _embedPatterns)
    {
        RequiredAttribute[] required = new RequiredAttribute[] { new RequiredAttribute(pattern.Attr) };
        IElementPredicate embedWithAttribute = new BeginTagPredicate("embed", required);

        // Each pattern scans the input from the beginning with a fresh extractor.
        HtmlExtractor extractor = new HtmlExtractor(input);
        extractor = extractor.Seek(embedWithAttribute);
        if (!extractor.Success)
        {
            continue;
        }

        BeginTag embedTag = extractor.Element as BeginTag;
        string attributeValue = embedTag.GetAttributeValue(pattern.Attr);
        Match match = Regex.Match(attributeValue, pattern.Pattern, RegexOptions.IgnoreCase);
        if (!match.Success)
        {
            continue;
        }

        Group idGroup = match.Groups["id"];
        if (idGroup.Success)
        {
            return idGroup.Value;
        }
    }

    return String.Empty;
}
/// <summary>
/// Collects the links in <paramref name="htmlSource"/> that mention the domain of the
/// current link and wraps each one in a new, not-yet-processed <c>Link</c>.
/// </summary>
/// <param name="htmlSource">HTML of the page being parsed.</param>
/// <returns>One <c>Link</c> per URL returned by <c>HtmlExtractor.UrlCollection</c>.</returns>
public override List<Link> ParseUrls(string htmlSource)
{
    var domain = HtmlExtractor.GetDomainName(this.Link.Uri);

    // Escape the domain before embedding it in the include pattern; otherwise the dots in
    // "example.com" act as regex wildcards and unrelated hosts can match.
    // ("" + domain keeps the original null-tolerant concatenation semantics.)
    var regxInclude = "[^>]*" + System.Text.RegularExpressions.Regex.Escape("" + domain) + "[^>]*";

    var temp = HtmlExtractor.UrlCollection(regxInclude, "", htmlSource, this.Link.Uri);
    var res = new List<Link>();
    foreach (var t in temp)
    {
        res.Add(new Link()
        {
            Deep = 0,
            ParseCompleted = false,
            Uri = t,
            UriParent = this.Link.Uri,
            ParseInprogress = false,
            Id = t.UrlToHashCode()
        });
    }

    return res;
}
/// <summary>
/// Collects downloadable resources referenced by <paramref name="htmlSource"/>: hrefs that
/// mention the domain of the current link, followed by image sources.
/// </summary>
/// <param name="htmlSource">HTML of the page being parsed.</param>
/// <returns>
/// One not-yet-downloaded <c>File</c> per entry returned by
/// <c>HtmlExtractor.UrlCollectionWithHref</c> and <c>HtmlExtractor.UrlImgCollectionWithSrc</c>,
/// in that order.
/// </returns>
public override List<File> ParseFiles(string htmlSource)
{
    var domain = HtmlExtractor.GetDomainName(this.Link.Uri);

    // Escape the domain before embedding it in the include pattern; otherwise the dots in
    // "example.com" act as regex wildcards and unrelated hosts can match.
    // ("" + domain keeps the original null-tolerant concatenation semantics.)
    var regxInclude = "[^>]*" + System.Text.RegularExpressions.Regex.Escape("" + domain) + "[^>]*";

    var temp = HtmlExtractor.UrlCollectionWithHref(regxInclude, "", htmlSource, this.Link.Uri);
    var res = new List<File>();
    foreach (var t in temp)
    {
        res.Add(new File()
        {
            DownloadCompleted = false,
            MergerCompleted = false,
            PathOnDisk = "",
            PathOnWeb = t.Key,
            Href = t.Value,
            DownloadInprogress = false,
            Id = t.Key.UrlToHashCode()
        });
    }

    var src = HtmlExtractor.UrlImgCollectionWithSrc(htmlSource, this.Link.Uri);
    foreach (var s in src)
    {
        res.Add(new File()
        {
            DownloadCompleted = false,
            MergerCompleted = false,
            PathOnDisk = "",
            PathOnWeb = s.Key,
            Href = s.Value,
            DownloadInprogress = false,
            Id = s.Key.UrlToHashCode()
        });
    }

    return res;
}
/// <summary>
/// Reads the content of a post from <paramref name="xmlNode"/> and stores its title and body
/// on <paramref name="blogPost"/>.
/// </summary>
/// <param name="xmlNode">
/// Node holding the content, either as plain inner text or inside a <c>base64</c> child.
/// </param>
/// <param name="blogPost">Post that receives the title and contents.</param>
private void ParsePostContent(IXmlNode xmlNode, BlogPost blogPost)
{
    // get raw content (decode base64 if necessary)
    var encodedNode = xmlNode.SelectSingleNode("base64");
    string body = encodedNode != null
        ? _utf8EncodingNoBOM.GetString(Convert.FromBase64String(encodedNode.InnerText))
        : xmlNode.InnerText; // no base64 encoding, just read text

    // parse out the title; the remaining text after it is the post body
    HtmlExtractor titleFinder = new HtmlExtractor(body);
    if (titleFinder.Seek("<title>").Success)
    {
        string title = titleFinder.CollectTextUntil("title");
        SetPostTitleFromXmlValue(blogPost, title);
        body = body.Substring(titleFinder.Parser.Position).TrimStart('\r', '\n');
    }

    // whitespace-only bodies leave the post contents untouched
    if (body.Trim() == string.Empty)
    {
        return;
    }

    HtmlExtractor cutFinder = new HtmlExtractor(body);
    bool hasCut = Options.SupportsExtendedEntries && cutFinder.Seek("<lj-cut>").Success;
    if (hasCut)
    {
        // split around the <lj-cut> tag, dropping the tag itself
        blogPost.SetContents(
            body.Substring(0, cutFinder.Element.Offset),
            body.Substring(cutFinder.Element.Offset + cutFinder.Element.Length));
    }
    else
    {
        blogPost.Contents = body;
    }
}
/// <summary>
/// Checks whether <paramref name="input"/> satisfies every configured embed pattern.
/// </summary>
/// <param name="input">HTML containing the embed markup.</param>
/// <returns>
/// <c>true</c> when, for each pattern, an <c>embed</c> tag carrying the pattern's attribute is
/// found and the attribute value matches the pattern's regex; <c>false</c> as soon as one
/// pattern fails.
/// </returns>
public bool MatchesEmbed(string input)
{
    foreach (EmbedPattern pattern in _embedPatterns)
    {
        RequiredAttribute[] required = new RequiredAttribute[] { new RequiredAttribute(pattern.Attr) };
        IElementPredicate embedWithAttribute = new BeginTagPredicate("embed", required);

        HtmlExtractor extractor = new HtmlExtractor(input);
        extractor = extractor.Seek(embedWithAttribute);
        if (!extractor.Success)
        {
            // didn't find embed tag with the attr
            return false;
        }

        BeginTag embedTag = extractor.Element as BeginTag;
        string attributeValue = embedTag.GetAttributeValue(pattern.Attr);
        bool valueMatches = Regex.IsMatch(attributeValue, pattern.Pattern, RegexOptions.IgnoreCase);
        if (!valueMatches)
        {
            return false;
        }
    }

    // found all predicates
    return true;
}
/// <summary>
/// Extracts content from <paramref name="source"/> using the configured selectors, trying
/// progressively less specific lookups until one yields a non-empty result.
/// </summary>
/// <remarks>
/// Lookup order: id/name, tag + class, tag only, class only, and finally the <c>body</c> tag.
/// Each step runs only when the previous ones produced nothing and its selector is set.
/// </remarks>
/// <param name="source">HTML to extract from.</param>
/// <returns>The extracted content, which is also stored in <c>ExtractedContent</c>.</returns>
public string Extract(string source)
{
    var temp = string.IsNullOrEmpty(ElementById)
        ? ""
        : HtmlExtractor.ContentByIdOrName(ElementById, source);

    if (string.IsNullOrEmpty(temp) && !string.IsNullOrEmpty(ElementByTagName) && !string.IsNullOrEmpty(ElementByClassName))
    {
        temp = HtmlExtractor.ContentByTagAndClassNameAndIndex(ElementByTagName, ElementByClassName, source, ElementByIndex);
    }

    if (string.IsNullOrEmpty(temp) && !string.IsNullOrEmpty(ElementByTagName))
    {
        temp = HtmlExtractor.ContentByTagNameAndIndex(ElementByTagName, ElementByIndex, source);
    }

    if (string.IsNullOrEmpty(temp) && !string.IsNullOrEmpty(ElementByClassName))
    {
        // Look up by the class name; the tag name was previously passed here by mistake,
        // so the class-only fallback never searched for the configured class.
        temp = HtmlExtractor.ContentByClassNameAndIndex(ElementByClassName, source, ElementByIndex);
    }

    if (string.IsNullOrEmpty(temp))
    {
        // Last resort: the document body.
        temp = HtmlExtractor.ContentByTagNameAndIndex("body", ElementByIndex, source);
    }

    ExtractedContent = temp;
    return temp;
}
/// <summary>
/// Console entry point that extracts hotel information from "task1.html" three times: via a
/// folder/file-name pair, via the file's text, and via a file stream, printing each result
/// as indented camel-case JSON and waiting for Enter between tasks.
/// </summary>
/// <param name="args">Command-line arguments (unused).</param>
static void Main(string[] args)
{
    const string bannerRule = "/*---------------------------------------*/";
    const string bannerTitle = "/* HQ Plus Selection Tests By Joao Prado */";
    const string outputRule = "--------------------------------------";

    // Task 1A: extractor built from a folder and file name.
    Console.WriteLine(bannerRule);
    Console.WriteLine(bannerTitle);
    Console.WriteLine("/* Task 1A - Generate JSON From File */");
    Console.WriteLine(bannerRule);
    Console.WriteLine();
    Console.WriteLine();

    string folder = Directory.GetCurrentDirectory();
    string fileName = "task1.html";
    var htmlExtractorFromFile = new HtmlExtractor(folder, fileName);
    var hotelModelA = htmlExtractorFromFile.GetHotelInformation();
    var options = new JsonSerializerOptions
    {
        PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
        WriteIndented = true
    };
    var modelJson = JsonSerializer.Serialize(hotelModelA, options);

    Console.WriteLine("Extracted JSON from file:");
    Console.WriteLine(outputRule);
    Console.WriteLine(modelJson);
    Console.WriteLine(outputRule);
    Console.WriteLine("Finished task 1A get JSON from file.");
    Console.WriteLine("Press any key to continue next task.");
    Console.ReadLine();

    // Task 1B: extractor built from the file's text.
    Console.WriteLine(bannerRule);
    Console.WriteLine(bannerTitle);
    Console.WriteLine("/* Task 1B - Generate JSON From String */");
    Console.WriteLine(bannerRule);

    var htmlString = File.ReadAllText(Path.Combine(folder, fileName));
    var htmlExtractorFromString = new HtmlExtractor(htmlString);
    var hotelModelB = htmlExtractorFromString.GetHotelInformation();
    modelJson = JsonSerializer.Serialize(hotelModelB, options);

    Console.WriteLine();
    Console.WriteLine("Extracted JSON from string:");
    Console.WriteLine(outputRule);
    Console.WriteLine(modelJson);
    Console.WriteLine(outputRule);
    Console.WriteLine("Finished task 1B get JSON from string.");
    Console.WriteLine("Press any key to continue next task.");
    Console.ReadLine();

    // Task 1C: extractor built from an open file stream.
    Console.WriteLine(bannerRule);
    Console.WriteLine(bannerTitle);
    Console.WriteLine("/* Task 1C - Generate JSON From Stream */");
    Console.WriteLine(bannerRule);
    Console.WriteLine();

    using (var fileStream = new FileStream(Path.Combine(folder, fileName), FileMode.Open, FileAccess.Read))
    {
        var htmlExtractorFromStream = new HtmlExtractor(fileStream);
        var hotelModelC = htmlExtractorFromStream.GetHotelInformation();
        modelJson = JsonSerializer.Serialize(hotelModelC, options);

        Console.WriteLine("Extracted JSON from stream:");
        Console.WriteLine(outputRule);
        Console.WriteLine(modelJson);
        Console.WriteLine(outputRule);
    }

    // Task 1C reads from a stream; the message previously said "string" (copy-paste slip).
    Console.WriteLine("Finished task 1C get JSON from stream.");
    Console.WriteLine("HQ Plus Task 1 finished.");
    Console.WriteLine("Press any key to exit. ");
    Console.ReadLine();
}
/// <summary>
/// Detects every setting that is derived from the homepage HTML; any such setting belongs in
/// this function. The function is switched on and off through IncludeHomePageSettings when
/// detecting blog settings, and none of these checks run if the internet is not active.
/// Individual checks do not write back to _content as they go; the collected settings are
/// handed over once at the end, at which point they are written back to the registry.
/// </summary>
private void DetectHomePageSettings()
{
    if (_homepageAccessor.HtmlDocument == null)
    {
        return;
    }

    IDictionary overrides = new Hashtable();

    Debug.Assert(!UseManifestCache, "This code will not run correctly under the manifest cache, due to option overrides not being set");

    // character set declared by the page
    LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument);
    if (metaData.Charset != null)
    {
        try
        {
            overrides.Add(BlogClientOptions.CHARACTER_SET, metaData.Charset);
        }
        catch (NotSupportedException)
        {
            //not an actual encoding
        }
    }

    // an xhtml doctype means the blog requires XHTML
    // NOTE(review): a second metadata object is built here although metaData above wraps the
    // same document; kept as-is to preserve behavior - confirm whether it can be reused.
    string docType = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument).DocType;
    if (docType != null && docType.IndexOf("xhtml", StringComparison.OrdinalIgnoreCase) >= 0)
    {
        overrides.Add(BlogClientOptions.REQUIRES_XHTML, true.ToString(CultureInfo.InvariantCulture));
    }

    //checking whether blog is rtl
    HtmlExtractor extractor = new HtmlExtractor(_homepageAccessor.HtmlDocument.RawHtml);
    OrPredicate htmlOrBodyWithDir = new OrPredicate(
        new SmartPredicate("<html dir>"),
        new SmartPredicate("<body dir>"));
    if (extractor.Seek(htmlOrBodyWithDir).Success)
    {
        string direction = ((BeginTag)extractor.Element).GetAttributeValue("dir");
        if (string.Equals(direction, "rtl", StringComparison.OrdinalIgnoreCase))
        {
            overrides.Add(BlogClientOptions.TEMPLATE_IS_RTL, true.ToString(CultureInfo.InvariantCulture));
        }
    }

    // DHTML image viewer referenced by the page's scripts
    if (_homepageAccessor.HtmlDocument != null)
    {
        string originalHtml = _homepageAccessor.OriginalHtml;
        ImageViewer detectedViewer = DhtmlImageViewers.DetectImageViewer(originalHtml, _context.HomepageUrl);
        if (detectedViewer != null)
        {
            overrides.Add(BlogClientOptions.DHTML_IMAGE_VIEWER, detectedViewer.Name);
        }
    }

    _context.HomePageOverrides = overrides;
}
/// <summary>
/// Sample entry point: builds an extraction template from "amazonEmail1.html", applies it to
/// "amazonEmail2.html", prints the result as JSON and writes it to "ECommerceOrder.csv".
/// </summary>
public static void Main()
{
    const string rule = "------------------------------------------------------------------------------------------------------------";

    C1.TextParser.LicenseManager.Key = License.Key;

    // Both input files are opened in using blocks so the handles are released even when
    // template construction or extraction throws; previously neither stream was disposed.
    // The streams stay open until the result has been fully consumed.
    using (Stream amazonTemplateStream = File.Open(@"amazonEmail1.html", FileMode.Open))
    {
        /* Amazon template */
        HtmlExtractor amazonTemplate = new HtmlExtractor(amazonTemplateStream);

        //Repeated block for each article in the order
        String articleNameXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[1]/td[2]/p/a";
        amazonTemplate.AddPlaceHolder("ordered articles", "article name", articleNameXPath);
        String articlePriceXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[1]/td[3]/strong";
        amazonTemplate.AddPlaceHolder("ordered articles", "article price", articlePriceXPath);
        String articleSellerXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[1]/td[2]/p/span";
        amazonTemplate.AddPlaceHolder("ordered articles", "article seller", articleSellerXPath, 8, 18);

        //Fixed placeHolder for the expected delivery date
        String deliveryDateXPath = @"/html/body/div[2]/div/div/div/table/tbody/tr[3]/td/table/tbody/tr[1]/td[1]/p/strong";
        amazonTemplate.AddPlaceHolder("delivery date", deliveryDateXPath);

        //Fixed placeHolder for the total amount of the order
        String totalAmountXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[8]/td[2]/strong";
        amazonTemplate.AddPlaceHolder("total order amount", totalAmountXPath);

        //Fixed placeHolder for the customer name
        String customerNameXPath = @"/html/body/div[2]/div/div/div/table/tbody/tr[2]/td/p[1]";
        amazonTemplate.AddPlaceHolder("customer name", customerNameXPath, 6, 15);

        using (Stream source = File.Open(@"amazonEmail2.html", FileMode.Open))
        {
            IExtractionResult extractedResult = amazonTemplate.Extract(source);

            Console.WriteLine(rule);
            Console.WriteLine("GrapeCity, inc, all rights reserved");
            Console.WriteLine("Demo of the C1TextParser library - Html extractor sample");
            Console.WriteLine("Test case: From amazon order emails extract relevant information about the order itself.");
            Console.WriteLine(" This sample pretends to demonstrate the repeated place holder extraction capabilities of");
            Console.WriteLine(" C1TextParser - Html extractor");
            Console.WriteLine("Detail: The sample consists on three fixed place holders and one repeated block. The fixed place holders are");
            Console.WriteLine(" the customer name, the order delivery date and also the total amount of the order. The repeated ");
            Console.WriteLine(" block is used to extract each article that appear in the ordered article list. It contains three");
            Console.WriteLine(" repeated place holders. These are: the name, the price and the seller of the article.");
            Console.WriteLine(" The amazon email used as the extraction source is \"amazonEmail2.html\" and can be consulted in the");
            Console.WriteLine(" current working directory. Also, \"ECommerceOrder.csv\" contains the parsing result");
            Console.WriteLine(rule);
            Console.WriteLine(rule);
            Console.WriteLine("JSon String result:");
            Console.WriteLine(rule);
            Console.WriteLine(extractedResult.ToJsonString());
            Console.WriteLine(rule);

            // Export the fixed place holders and the repeated block as two CSV sections.
            AmazonTemplateFixedPlaceHolders amazonTemplateFixedPlaceHolders = extractedResult.Get<AmazonTemplateFixedPlaceHolders>();
            StringBuilder sb1 = CsvExportHelper.ExportList(new List<AmazonTemplateFixedPlaceHolders>() { amazonTemplateFixedPlaceHolders });
            var amazonTemplateOrderedItems = extractedResult.Get<AmazonTemplateRepeatedBlocks>().OrderedItems;
            StringBuilder sb2 = CsvExportHelper.ExportList(amazonTemplateOrderedItems);
            var sb3 = sb1 + "\n" + sb2;
            File.WriteAllText("ECommerceOrder.csv", sb3);
        }
    }

    Console.ReadLine();
}
/// <summary>
/// Sample entry point: builds an extraction template from "vietjetairEmail1.html", applies it
/// to "vietjetairEmail2.html", prints the result as JSON and writes it to "FlightETicket.csv".
/// </summary>
public static void Main()
{
    const string rule = "------------------------------------------------------------------------------------------------------------------";

    C1.TextParser.LicenseManager.Key = License.Key;

    // Both input files are opened in using blocks so the handles are released even when
    // template construction or extraction throws; previously neither stream was disposed.
    // The streams stay open until the result has been fully consumed.
    using (Stream vietjetairTemplateStream = File.Open(@"vietjetairEmail1.html", FileMode.Open))
    {
        /* Vietjetair template */
        HtmlExtractor vietjetairTemplate = new HtmlExtractor(vietjetairTemplateStream);

        //Fixed placeHolder for the passenger name
        String passengerNameXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[2]/tbody/tr[3]/td";
        vietjetairTemplate.AddPlaceHolder("passenger name", passengerNameXPath);

        //Fixed placeHolder for the booking number
        String bookingNumberXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[1]/tbody/tr/td[2]/span";
        vietjetairTemplate.AddPlaceHolder("booking number", bookingNumberXPath);

        //Fixed placeHolder for the booking status
        String bookingStatusXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[2]/tbody/tr[1]/td[1]";
        vietjetairTemplate.AddPlaceHolder("booking status", bookingStatusXPath);

        //Fixed placeHolder for the fare type
        String fareTypeXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[4]/tbody/tr/td[3]";
        vietjetairTemplate.AddPlaceHolder("fare type", fareTypeXPath);

        //Fixed placeHolder for total amount
        String totalAmountXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[6]/tbody/tr[2]/td/table[2]/tbody/tr[2]/td[3]";
        vietjetairTemplate.AddPlaceHolder("total amount", totalAmountXPath);

        //Fixed placeHolder for city of departure
        String cityOfDepartureXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[4]/tbody/tr/td[4]/text()";
        vietjetairTemplate.AddPlaceHolder("city of departure", cityOfDepartureXPath, 8, 12);

        //Fixed placeHolder for year of booking date
        String yearOfBookingXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[2]/tbody/tr[2]/td[1]";
        vietjetairTemplate.AddPlaceHolder("year of booking", yearOfBookingXPath, 6, 4);

        using (Stream source = File.Open(@"vietjetairEmail2.html", FileMode.Open))
        {
            IExtractionResult extractedResult = vietjetairTemplate.Extract(source);

            Console.WriteLine(rule);
            Console.WriteLine("GrapeCity, inc, all rights reserved");
            Console.WriteLine("Demo of the C1TextParser library - Html extractor sample");
            Console.WriteLine("Test case: Test case: From a vietjetair e-ticket extract relevant information about the flight. Note that the");
            Console.WriteLine(" email used as extraction source was modified on purpose (added random text at different locations)");
            Console.WriteLine(" with the intent to show that html extractor is flexible enough to retrieve the intended text.");
            Console.WriteLine("Detail: This consists on seven fixed place holders. These are: the passenger name, the booking number, the");
            Console.WriteLine(" booking status, the fare type, the total amount, the city of departure and, finally, the year of booking");
            Console.WriteLine(" The vietjetair email used as the extraction source is \"vietjetairEmail2.html\" and can be consulted");
            Console.WriteLine(" in the current working directory. Also, \"FlightETicket.csv\" contains the parsing result");
            Console.WriteLine(rule);
            Console.WriteLine(rule);
            Console.WriteLine("JSon String result:");
            Console.WriteLine(rule);
            Console.WriteLine(extractedResult.ToJsonString());
            Console.WriteLine(rule);

            // Export the single extracted ticket as CSV.
            FlightTicket vietjetairResult = extractedResult.Get<FlightTicket>();
            StringBuilder sb = CsvExportHelper.ExportList(new List<FlightTicket>() { vietjetairResult });
            File.WriteAllText("FlightETicket.csv", sb.ToString());
        }
    }

    Console.ReadLine();
}
/// <summary>
/// Looks in the head of the homepage for an Atom service-document link and, when one is
/// found, configures this instance for the generic Atom provider.
/// </summary>
/// <param name="url">Homepage URL, used to make the link's href absolute.</param>
/// <param name="html">Homepage HTML; <c>null</c> yields <c>false</c>.</param>
/// <param name="preferredOnly">
/// When <c>true</c>, the link is accepted only if its <c>class</c> attribute contains the
/// word <c>preferred</c>.
/// </param>
/// <returns>
/// <c>true</c> when a suitable link was found and the provider fields, post API URL, user
/// blogs and blog name were populated; otherwise <c>false</c>.
/// </returns>
private bool AttemptGenericAtomLinkDetection(string url, string html, bool preferredOnly)
{
    const string GENERIC_ATOM_PROVIDER_ID = "D48F1B5A-06E6-4f0f-BD76-74F34F520792";

    if (html == null)
    {
        return false;
    }

    // The service link must sit inside <head>: first enter the head (stopping at <body>),
    // then look for the link (stopping at </head>).
    HtmlExtractor linkFinder = new HtmlExtractor(html);
    bool linkFound = linkFinder
        .SeekWithin("<head>", "<body>")
        .SeekWithin("<link href rel='service' type='application/atomsvc+xml'>", "</head>")
        .Success;
    if (!linkFound)
    {
        return false;
    }

    IBlogProvider provider = BlogProviderManager.FindProvider(GENERIC_ATOM_PROVIDER_ID);
    BeginTag serviceLink = linkFinder.Element as BeginTag;

    if (preferredOnly)
    {
        string classes = serviceLink.GetAttributeValue("class");
        if (classes == null || !Regex.IsMatch(classes, @"\bpreferred\b"))
        {
            return false;
        }
    }

    string serviceHref = serviceLink.GetAttributeValue("href");

    Debug.WriteLine("Atom service link detected in the blog homepage");

    _providerId = provider.Id;
    _serviceName = provider.Name;
    _clientType = provider.ClientType;
    _blogName = string.Empty;
    _postApiUrl = GetAbsoluteUrl(url, serviceHref);

    IBlogClient client = BlogClientManager.CreateClient(provider.ClientType, _postApiUrl, _credentials);
    client.VerifyCredentials();
    _usersBlogs = client.GetUsersBlogs();

    // With exactly one blog there is no ambiguity, so adopt it as the host blog.
    if (_usersBlogs.Length == 1)
    {
        _hostBlogId = _usersBlogs[0].Id;
        _blogName = _usersBlogs[0].Name;
        /*
         * if (_usersBlogs[0].HomepageUrl != null && _usersBlogs[0].HomepageUrl.Length > 0)
         *     _homepageUrl = _usersBlogs[0].HomepageUrl;
         */
    }

    // attempt to read the blog name from the homepage title
    if (string.IsNullOrEmpty(_blogName))
    {
        HtmlExtractor titleFinder = new HtmlExtractor(html);
        if (titleFinder.Seek("<title>").Success)
        {
            _blogName = titleFinder.CollectTextUntil("title");
        }
    }

    return true;
}