コード例 #1
0
        /// <summary>
        /// Reports whether seeking through <paramref name="input"/> finds a begin tag
        /// named <c>embed</c> or <c>object</c>.
        /// </summary>
        /// <param name="input">HTML text handed to the extractor.</param>
        /// <returns>The <c>Success</c> flag of the seek.</returns>
        private static bool ContainsEmbed(string input)
        {
            BeginTagPredicate embedTag  = new BeginTagPredicate("embed");
            BeginTagPredicate objectTag = new BeginTagPredicate("object");
            IElementPredicate eitherTag = new OrPredicate(embedTag, objectTag);

            HtmlExtractor extractor = new HtmlExtractor(input);
            return extractor.Seek(eitherTag).Success;
        }
コード例 #2
0
        /// <summary>
        /// Rebuilds <paramref name="html"/>, replacing each span carrying the preserve
        /// class whose id has the form <c>preserve&lt;key&gt;</c> with the value stored
        /// under that key in <c>preserved</c>. Spans without a matching id or key are
        /// copied through unchanged.
        /// </summary>
        /// <param name="html">Markup containing the placeholder spans.</param>
        /// <returns>The markup with known placeholders substituted.</returns>
        public string RestorePreserved(string html)
        {
            StringBuilder output    = new StringBuilder();
            HtmlExtractor extractor = new HtmlExtractor(html);
            int           cursor    = 0;

            while (extractor.Seek("<span class='" + PRESERVE_CLASS + "'>").Success)
            {
                // Copy everything between the previous cursor and this span verbatim.
                int spanStart = extractor.Element.Offset;
                output.Append(html, cursor, spanStart - cursor);
                cursor = spanStart;

                BeginTag spanTag = (BeginTag)extractor.Element;
                Match    idMatch = Regex.Match(spanTag.GetAttributeValue("id") ?? "", @"^preserve([a-zA-Z0-9]+)$");
                if (!idMatch.Success)
                {
                    continue;
                }

                string preservedValue;
                if (!preserved.TryGetValue(idMatch.Groups[1].Value, out preservedValue))
                {
                    continue;
                }

                // Emit the stored value, then skip the placeholder's own content.
                output.Append(preservedValue);
                extractor.CollectTextUntil("span");

                // No current element after the collect: treat the rest of the input as consumed.
                cursor = extractor.Element == null ? html.Length : extractor.Parser.Position;
            }

            output.Append(html, cursor, html.Length - cursor);
            return output.ToString();
        }
コード例 #3
0
        /// <summary>
        /// Looks for the first <c>embed</c> or <c>object</c> begin tag that has both
        /// <c>width</c> and <c>height</c> attributes and returns those dimensions.
        /// </summary>
        /// <param name="input">HTML to inspect; may be null or empty.</param>
        /// <returns>
        /// The parsed size, or a size built from <c>_width</c>/<c>_height</c> when the
        /// input is empty, no such tag is found, or an exception occurs.
        /// </returns>
        Size FindSizeAttribute(string input)
        {
            Size result = new Size(_width, _height);

            if (!string.IsNullOrEmpty(input))
            {
                try
                {
                    RequiredAttribute[] dimensions = { new RequiredAttribute("width"), new RequiredAttribute("height") };
                    IElementPredicate   sizedTag   = new OrPredicate(new BeginTagPredicate("embed", dimensions), new BeginTagPredicate("object", dimensions));
                    HtmlExtractor       extractor  = new HtmlExtractor(input);

                    if (extractor.Seek(sizedTag).Success)
                    {
                        BeginTag tag    = (BeginTag)extractor.Element;
                        int      width  = Convert.ToInt32(tag.GetAttributeValue("width"), CultureInfo.InvariantCulture);
                        int      height = Convert.ToInt32(tag.GetAttributeValue("height"), CultureInfo.InvariantCulture);
                        result = new Size(width, height);
                    }
                }
                catch (Exception error)
                {
                    // Any failure (e.g. a non-numeric attribute value) leaves the default size in place.
                    Trace.Fail("Exception thrown while trying to find video size: " + error);
                }
            }

            return result;
        }
コード例 #4
0
        /// <summary>
        /// Heuristically decides whether the blog is hosted on Blogger.
        /// </summary>
        /// <remarks>
        /// The answer is <c>true</c> when any of the following holds:
        /// the homepage URL is on <c>*.blogspot.com</c> or <c>blogger.com</c> (http or https);
        /// the homepage HTML satisfies <c>BloggerGeneratorCriterion</c>;
        /// or the HTML contains a <c>service.post</c> atom link whose href points at
        /// blogger.com or blogspot.com.
        /// </remarks>
        /// <returns><c>true</c> if one of the Blogger indicators is found; otherwise <c>false</c>.</returns>
        public bool IsBlogger()
        {
            // Accept both http and https for blogspot, consistent with the blogger.com
            // check below and with the atom-link check further down.
            if (Regex.IsMatch(homepageUrl, @"^https?://.+\.blogspot\.com($|/)", RegexOptions.IgnoreCase) ||
                Regex.IsMatch(homepageUrl, @"^http(s)?://(www\.)?blogger\.com($|/)", RegexOptions.IgnoreCase) ||
                new HtmlExtractor(html).Seek(new BloggerGeneratorCriterion()).Success)
            {
                return(true);
            }

            HtmlExtractor ex = new HtmlExtractor(html);

            while (ex.Seek("<link href rel='service.post' type='application/atom+xml'>").Success)
            {
                BeginTag bt       = (BeginTag)ex.Element;
                string   atomHref = bt.GetAttributeValue("href");

                // An href without a usable value cannot identify Blogger; skip it
                // instead of dereferencing null.
                if (String.IsNullOrEmpty(atomHref))
                {
                    continue;
                }

                // these obsolete Blogger atom links can't be used, but are
                // still a good indication that it's Blogger
                if (atomHref.StartsWith("https://www.blogger.com/atom/", StringComparison.OrdinalIgnoreCase))
                {
                    return(true);
                }

                // any other blogger or blogspot atom link will be considered a match
                if (Regex.IsMatch(atomHref, @"^https?\:\/\/.+\.blog(ger|spot)\.com\/.*", RegexOptions.IgnoreCase))
                {
                    return(true);
                }
            }

            return(false);
        }
コード例 #5
0
        /// <summary>
        /// Scans the <c>&lt;script src&gt;</c> tags of <paramref name="html"/> and returns
        /// the first known image viewer whose pattern matches a script URL.
        /// </summary>
        /// <param name="html">Page markup to scan.</param>
        /// <param name="sourceUrl">Base URL used to resolve non-absolute script sources.</param>
        /// <returns>The matching <c>ImageViewer</c>, or <c>null</c> when none matches.</returns>
        public static ImageViewer DetectImageViewer(string html, string sourceUrl)
        {
            List <ImageViewer> candidates = imageViewers;

            // One regex per candidate, in the same order as the candidate list.
            // NOTE(review): LazyLoader presumably defers this work until .Value is first read - confirm.
            LazyLoader <List <Regex> > compiledPatterns = new LazyLoader <List <Regex> >(delegate
            {
                List <Regex> compiled = new List <Regex>(candidates.Count);
                foreach (ImageViewer candidate in candidates)
                {
                    compiled.Add(new Regex(candidate.Pattern, RegexOptions.CultureInvariant));
                }
                return(compiled);
            });

            HtmlExtractor extractor = new HtmlExtractor(html);

            while (extractor.Seek("<script src>").Success)
            {
                string scriptUrl = ((BeginTag)extractor.Element).GetAttributeValue("src");
                if (String.IsNullOrEmpty(scriptUrl))
                {
                    continue;
                }

                try
                {
                    // We need absolute URLs.
                    if (!UrlHelper.IsUrl(scriptUrl))
                    {
                        scriptUrl = UrlHelper.EscapeRelativeURL(sourceUrl, scriptUrl);
                    }

                    Uri parsed = new Uri(scriptUrl);
                    if (parsed.IsAbsoluteUri)
                    {
                        // WinLive 248276: keep only the path portion, since an additional query or
                        // fragment on the URL is something the regexes can't handle.
                        scriptUrl = parsed.GetLeftPart(UriPartial.Path);
                    }
                }
                catch (UriFormatException)
                {
                    // Fall through and match against whatever value we have so far.
                }

                int index = 0;
                foreach (Regex pattern in compiledPatterns.Value)
                {
                    if (pattern.IsMatch(scriptUrl))
                    {
                        return(candidates[index]);
                    }
                    index++;
                }
            }

            return(null);
        }
コード例 #6
0
        /// <summary>
        /// Downloads the homepage and returns the match of <paramref name="regex"/>
        /// against the first <c>Text</c> element that the regex matches.
        /// </summary>
        /// <param name="regex">Pattern applied to each text element's string form.</param>
        /// <returns>The match from that element, or <c>Match.Empty</c> when no text element matches.</returns>
        public Match MatchHomepageText(Regex regex)
        {
            HtmlExtractor     extractor    = new HtmlExtractor(DownloadHomepage());
            DelegatePredicate matchingText = new DelegatePredicate(element => element is Text && regex.IsMatch(element.ToString()));

            return extractor.Seek(matchingText).Success
                ? regex.Match(extractor.Element.ToString())
                : Match.Empty;
        }
コード例 #7
0
 /// <summary>
 /// Reports whether seeking through <paramref name="html"/> finds a begin tag named
 /// <c>embed</c>, <c>object</c> or <c>iframe</c>.
 /// </summary>
 /// <param name="html">Markup handed to the extractor.</param>
 /// <returns>The seek's <c>Success</c> flag; <c>false</c> if anything throws.</returns>
 private static bool ContainsEmbedOrObject(string html)
 {
     bool found;
     try
     {
         IElementPredicate embeddable = new OrPredicate(
             new BeginTagPredicate("embed"),
             new BeginTagPredicate("object"),
             new BeginTagPredicate("iframe"));

         found = new HtmlExtractor(html).Seek(embeddable).Success;
     }
     catch
     {
         // Best effort: any failure is reported as "not found".
         found = false;
     }
     return found;
 }
        /// <summary>
        /// Loads an extractor from <paramref name="templateStream"/> and runs it over
        /// <paramref name="source"/>.
        /// </summary>
        /// <param name="templateStream">Stream passed to <c>HtmlExtractor.Load</c>.</param>
        /// <param name="source">Stream passed to <c>Extract</c>.</param>
        /// <returns>
        /// The extraction result, or <c>null</c> when loading or extracting throws
        /// (the stack trace is written to the console in that case).
        /// </returns>
        private IExtractionResult _Extract(Stream templateStream, Stream source)
        {
            try
            {
                return HtmlExtractor.Load(templateStream).Extract(source);
            }
            catch (Exception error)
            {
                Console.WriteLine(error.StackTrace);
                return null;
            }
        }
コード例 #9
0
        /// <summary>
        /// Checks that <c>ParseFirst</c> with a text-extraction XPath rule returns the
        /// headline text placed inside the <c>h1</c> element.
        /// </summary>
        public void ContentParseTest()
        {
            const string expected = "China and India are making the planet greener, NASA says";

            XpathExtractModel headlineRule = new XpathExtractModel()
            {
                ExtractType        = ExtractType.Text,
                XpathRule          = "//h1[@class='pg-headline']",
                XpathEndAttributes = null
            };

            HtmlExtractor htmlExtract = new HtmlExtractor();
            htmlExtract.SetHtml("<div><h1 class=\"pg-headline\">" + expected + "</h1></div>");

            string actual = htmlExtract.ParseFirst(headlineRule);

            Assert.True(actual == expected);
        }
コード例 #10
0
        /// <summary>
        /// Decides whether content should be updated by comparing the <c>title</c> of the
        /// first <c>&lt;img title&gt;</c> tag found in each HTML fragment.
        /// </summary>
        /// <param name="oldHTML">Previously stored markup.</param>
        /// <param name="newHTML">Freshly generated markup.</param>
        /// <returns>
        /// <c>false</c> only when both fragments contain such an image and the titles are
        /// equal; <c>true</c> in every other case.
        /// </returns>
        public bool ShouldUpdateContent(string oldHTML, string newHTML)
        {
            HtmlExtractor oldExtractor = new HtmlExtractor(oldHTML);
            HtmlExtractor newExtractor = new HtmlExtractor(newHTML);

            HtmlExtractor oldImage = oldExtractor.Seek("<img title>");
            HtmlExtractor newImage = newExtractor.Seek("<img title>");

            // Without a titled image on both sides there is nothing to compare.
            if (!oldImage.Success || !newImage.Success)
            {
                return true;
            }

            string oldTitle = ((BeginTag)oldImage.Element).GetAttributeValue("title");
            string newTitle = ((BeginTag)newImage.Element).GetAttributeValue("title");
            return oldTitle != newTitle;
        }
コード例 #11
0
        /// <summary>
        /// Builds a <c>BlogPost</c> from an Atom entry node.
        /// </summary>
        /// <param name="entryNode">The Atom entry element.</param>
        /// <param name="includeCategories">Whether categories should be copied (also requires <c>Options.SupportsCategories</c>).</param>
        /// <param name="documentUri">URI passed through to the <c>AtomEntry</c> wrapper.</param>
        /// <returns>The populated post.</returns>
        /// <remarks>
        /// When extended entries are supported and the content contains an
        /// <c>&lt;a name="more"&gt;</c> anchor, the content is split around that anchor
        /// (and its closing <c>&lt;/a&gt;</c>, if one follows) via <c>SetContents</c>.
        /// </remarks>
        public override BlogPost Parse(XmlElement entryNode, bool includeCategories, Uri documentUri)
        {
            BlogPost  result = new BlogPost();
            AtomEntry entry  = new AtomEntry(_atomVer, _atomNS, CategoryScheme, _nsMgr, documentUri, entryNode);

            result.Title     = entry.Title;
            result.Excerpt   = entry.Excerpt;
            result.Id        = PostUriToPostId(entry.EditUri);
            result.Permalink = entry.Permalink;

            string body = entry.ContentHtml;

            if (body.Trim().Length > 0)
            {
                HtmlExtractor extractor = new HtmlExtractor(body);
                if (Options.SupportsExtendedEntries && extractor.Seek("<a name=\"more\">").Success)
                {
                    int anchorStart = extractor.Element.Offset;

                    // By default resume right after the opening anchor tag...
                    int resumeAt = anchorStart + extractor.Element.Length;

                    // ...but skip past the closing tag too when one is found.
                    if (extractor.Seek("</a>").Success)
                    {
                        resumeAt = extractor.Element.Offset + extractor.Element.Length;
                    }

                    result.SetContents(body.Substring(0, anchorStart), body.Substring(resumeAt));
                }
                else
                {
                    result.Contents = body;
                }
            }

            result.DatePublished = entry.PublishDate;
            if (Options.SupportsCategories && includeCategories)
            {
                result.Categories = entry.Categories;
            }

            return result;
        }
コード例 #12
0
        /// <summary>
        /// Removes every <c>div</c> whose class matches <paramref name="cssClass"/>
        /// from <paramref name="html"/>, keeping all other markup.
        /// </summary>
        /// <param name="html">Markup to filter.</param>
        /// <param name="cssClass">Class name spliced verbatim into the seek pattern.</param>
        /// <returns>The markup without the matching divs, or the input itself when the class name does not occur.</returns>
        // Warning: Does not deal with escaping properly. This is fine as long as
        // we're only using it for content we generate and there are no security
        // implications.
        public static string StripDivsWithClass(string html, string cssClass)
        {
            // Cheap pre-check. Ordinal, because a class name is an identifier and must
            // not be subject to culture-specific matching rules.
            if (html.IndexOf(cssClass, StringComparison.Ordinal) < 0)
            {
                return(html);
            }

            StringBuilder sb  = new StringBuilder();
            HtmlExtractor ex  = new HtmlExtractor(html);
            int           pos = 0;

            while (ex.Seek("<div class='" + cssClass + "'>").Success)
            {
                // Keep everything up to the div, then advance past the collected div content.
                sb.Append(html, pos, ex.Element.Offset - pos);
                ex.Parser.CollectHtmlUntil("div");
                pos = ex.Parser.Position;
            }
            sb.Append(html, pos, html.Length - pos);
            return(sb.ToString());
        }
コード例 #13
0
        /// <summary>
        /// Checks that <c>ParseList</c> with an <c>//a[@href]</c> rule and <c>href</c> as
        /// the end attribute yields the link's URL.
        /// </summary>
        public void HtmlContentParseTest()
        {
            string url = "https://ai.baidu.com";

            XpathExtractModel anchorRule = new XpathExtractModel()
            {
                ExtractType        = ExtractType.Text,
                XpathRule          = "//a[@href]",
                XpathEndAttributes = new List <string>() { "href" }
            };

            HtmlExtractor htmlExtract = new HtmlExtractor();
            htmlExtract.SetHtml("<div class=\"ai-nav-menu-item-list\"><a href=\"" + url + "\" target=\"_blank\" class=\"ai-nav-menu-item-list-item\">test001</a></div>");

            List <string> urls = htmlExtract.ParseList(anchorRule);

            Assert.True(urls.Contains(url));
        }
コード例 #14
0
 /// <summary>
 /// Extracts an id from the first <c>embed</c> tag in <paramref name="input"/> that
 /// matches one of the configured embed patterns.
 /// </summary>
 /// <param name="input">HTML that may contain an embed tag.</param>
 /// <returns>
 /// The value of the <c>id</c> regex group from the first pattern that matches,
 /// or <c>String.Empty</c> when no pattern yields an id.
 /// </returns>
 private string IdFromEmbed(string input)
 {
     foreach (EmbedPattern check in _embedPatterns)
     {
         IElementPredicate predicate = new BeginTagPredicate("embed", new RequiredAttribute[] { new RequiredAttribute(check.Attr) });
         HtmlExtractor     ex        = new HtmlExtractor(input).Seek(predicate);
         if (!ex.Success)
         {
             continue;
         }

         // Guard both the soft cast and the attribute value: a missing tag or a
         // value-less attribute simply means this pattern cannot match.
         BeginTag bt     = ex.Element as BeginTag;
         string   srcRef = bt == null ? null : bt.GetAttributeValue(check.Attr);
         if (srcRef == null)
         {
             continue;
         }

         Match m = Regex.Match(srcRef, check.Pattern, RegexOptions.IgnoreCase);
         if (m.Success && m.Groups["id"].Success)
         {
             return(m.Groups["id"].Value);
         }
     }
     return(String.Empty);
 }
コード例 #15
0
        /// <summary>
        /// Collects the URLs in <paramref name="htmlSource"/> that mention this link's
        /// domain and wraps each one in a fresh, unparsed <c>Link</c>.
        /// </summary>
        /// <param name="htmlSource">Markup to scan for URLs.</param>
        /// <returns>One <c>Link</c> per collected URL, with <c>UriParent</c> set to this link's URI.</returns>
        public override List <Link> ParseUrls(string htmlSource)
        {
            var domain = HtmlExtractor.GetDomainName(this.Link.Uri);

            // The domain is spliced into a regex fragment, so escape it: otherwise the
            // dots in "example.com" would match any character.
            var regxInclude = "[^>]*" + System.Text.RegularExpressions.Regex.Escape(domain ?? "") + "[^>]*";

            var temp = HtmlExtractor.UrlCollection(regxInclude, "", htmlSource, this.Link.Uri);
            var res  = new List <Link>();

            foreach (var t in temp)
            {
                res.Add(new Link()
                {
                    Deep            = 0,
                    ParseCompleted  = false,
                    Uri             = t,
                    UriParent       = this.Link.Uri,
                    ParseInprogress = false,
                    Id = t.UrlToHashCode()
                });
            }
            return(res);
        }
コード例 #16
0
        /// <summary>
        /// Collects downloadable files referenced by <paramref name="htmlSource"/>:
        /// first the href links that mention this link's domain, then the image sources.
        /// </summary>
        /// <param name="htmlSource">Markup to scan.</param>
        /// <returns>
        /// One not-yet-downloaded <c>File</c> per collected entry; the entry key becomes
        /// <c>PathOnWeb</c> and the entry value becomes <c>Href</c>.
        /// </returns>
        public override List <File> ParseFiles(string htmlSource)
        {
            var domain = HtmlExtractor.GetDomainName(this.Link.Uri);

            // The domain is spliced into a regex fragment, so escape it: otherwise the
            // dots in "example.com" would match any character.
            var regxInclude = "[^>]*" + System.Text.RegularExpressions.Regex.Escape(domain ?? "") + "[^>]*";

            var temp = HtmlExtractor.UrlCollectionWithHref(regxInclude, "", htmlSource, this.Link.Uri);
            var res  = new List <File>();

            // Files reached through href attributes.
            foreach (var t in temp)
            {
                res.Add(new File()
                {
                    DownloadCompleted = false,
                    MergerCompleted   = false,
                    PathOnDisk        = "",
                    PathOnWeb         = t.Key,
                    Href = t.Value,
                    DownloadInprogress = false,
                    Id = t.Key.UrlToHashCode()
                });
            }

            // Files reached through image src attributes.
            var src = HtmlExtractor.UrlImgCollectionWithSrc(htmlSource, this.Link.Uri);

            foreach (var s in src)
            {
                res.Add(new File()
                {
                    DownloadCompleted = false,
                    MergerCompleted   = false,
                    PathOnDisk        = "",
                    PathOnWeb         = s.Key,
                    Href = s.Value,
                    DownloadInprogress = false,
                    Id = s.Key.UrlToHashCode()
                });
            }
            return(res);
        }
コード例 #17
0
        /// <summary>
        /// Reads a post's raw content from <paramref name="xmlNode"/> and fills in the
        /// title and contents of <paramref name="blogPost"/>.
        /// </summary>
        /// <param name="xmlNode">Node holding the content, either as text or inside a <c>base64</c> child.</param>
        /// <param name="blogPost">Post that receives the title and contents.</param>
        private void ParsePostContent(IXmlNode xmlNode, BlogPost blogPost)
        {
            // Raw content: decode the base64 child when present, otherwise take the node text.
            var    encodedNode = xmlNode.SelectSingleNode("base64");
            string content     = encodedNode != null
                ? _utf8EncodingNoBOM.GetString(Convert.FromBase64String(encodedNode.InnerText))
                : xmlNode.InnerText;

            // A leading <title> element carries the post title; the body is whatever follows it.
            HtmlExtractor titleExtractor = new HtmlExtractor(content);
            if (titleExtractor.Seek("<title>").Success)
            {
                var title = titleExtractor.CollectTextUntil("title");
                SetPostTitleFromXmlValue(blogPost, title);
                content = content.Substring(titleExtractor.Parser.Position).TrimStart('\r', '\n');
            }

            if (content.Trim().Length == 0)
            {
                return;
            }

            // Split around <lj-cut> when extended entries are supported.
            HtmlExtractor cutExtractor = new HtmlExtractor(content);
            if (Options.SupportsExtendedEntries && cutExtractor.Seek("<lj-cut>").Success)
            {
                int cutStart = cutExtractor.Element.Offset;
                int cutEnd   = cutStart + cutExtractor.Element.Length;
                blogPost.SetContents(content.Substring(0, cutStart), content.Substring(cutEnd));
            }
            else
            {
                blogPost.Contents = content;
            }
        }
コード例 #18
0
 /// <summary>
 /// Checks whether <paramref name="input"/> satisfies every configured embed pattern.
 /// </summary>
 /// <param name="input">HTML that may contain an embed tag.</param>
 /// <returns>
 /// <c>true</c> when, for each pattern, the first <c>embed</c> tag carrying the
 /// pattern's attribute has a value matching the pattern's regex (which is also the
 /// result when there are no patterns); otherwise <c>false</c>.
 /// </returns>
 public bool MatchesEmbed(string input)
 {
     foreach (EmbedPattern check in _embedPatterns)
     {
         IElementPredicate predicate = new BeginTagPredicate("embed", new RequiredAttribute[] { new RequiredAttribute(check.Attr) });
         HtmlExtractor     ex        = new HtmlExtractor(input).Seek(predicate);
         if (!ex.Success)
         {
             return(false); //didn't find embed tag with the attr
         }

         // Guard both the soft cast and the attribute value: without a usable
         // value the pattern cannot match, so report a mismatch instead of throwing.
         BeginTag bt     = ex.Element as BeginTag;
         string   srcRef = bt == null ? null : bt.GetAttributeValue(check.Attr);
         if (srcRef == null || !Regex.IsMatch(srcRef, check.Pattern, RegexOptions.IgnoreCase))
         {
             return(false);
         }
     }
     return(true); //found all predicates
 }
コード例 #19
0
        /// <summary>
        /// Extracts content from <paramref name="source"/> using the configured selectors,
        /// trying each strategy in turn until one yields a non-empty result.
        /// </summary>
        /// <remarks>
        /// Order of attempts: by id/name; by tag name and class name; by tag name;
        /// by class name; and finally the <c>body</c> tag. The result is also stored in
        /// <c>ExtractedContent</c>.
        /// </remarks>
        /// <param name="source">HTML to extract from.</param>
        /// <returns>The extracted content (possibly empty if every strategy fails).</returns>
        public string Extract(string source)
        {
            var temp = string.IsNullOrEmpty(ElementById) ? "" : HtmlExtractor.ContentByIdOrName(ElementById, source);

            if (string.IsNullOrEmpty(temp) && !string.IsNullOrEmpty(ElementByTagName) && !string.IsNullOrEmpty(ElementByClassName))
            {
                temp = HtmlExtractor.ContentByTagAndClassNameAndIndex(ElementByTagName, ElementByClassName, source,
                                                                      ElementByIndex);
            }
            if (string.IsNullOrEmpty(temp) && !string.IsNullOrEmpty(ElementByTagName))
            {
                temp = HtmlExtractor.ContentByTagNameAndIndex(ElementByTagName, ElementByIndex, source);
            }
            if (string.IsNullOrEmpty(temp) && !string.IsNullOrEmpty(ElementByClassName))
            {
                // Look up by class name; passing the tag name here searched for the wrong thing.
                temp = HtmlExtractor.ContentByClassNameAndIndex(ElementByClassName, source, ElementByIndex);
            }
            if (string.IsNullOrEmpty(temp))
            {
                // Last resort: fall back to the document body.
                temp = HtmlExtractor.ContentByTagNameAndIndex("body", ElementByIndex, source);
            }
            ExtractedContent = temp;
            return(temp);
        }
コード例 #20
0
        /// <summary>
        /// Console driver for task 1: extracts the hotel information from
        /// <c>task1.html</c> three times (from a file path, from a string and from a
        /// stream) and prints each result as indented camel-case JSON.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            Console.WriteLine("/*---------------------------------------*/");
            Console.WriteLine("/* HQ Plus Selection Tests By Joao Prado */");
            Console.WriteLine("/*   Task 1A - Generate JSON From File   */");
            Console.WriteLine("/*---------------------------------------*/");
            Console.WriteLine();
            Console.WriteLine();

            // The input file is expected in the current working directory.
            string folder   = Directory.GetCurrentDirectory();
            string fileName = "task1.html";

            var htmlExtractorFromFile = new HtmlExtractor(folder, fileName);
            var hotelModelA           = htmlExtractorFromFile.GetHotelInformation();

            var options = new JsonSerializerOptions
            {
                PropertyNamingPolicy = JsonNamingPolicy.CamelCase,
                WriteIndented        = true
            };

            var modelJson = JsonSerializer.Serialize(hotelModelA, options);

            Console.WriteLine("Extracted JSON from file:");
            Console.WriteLine("--------------------------------------");
            Console.WriteLine(modelJson);
            Console.WriteLine("--------------------------------------");
            Console.WriteLine("Finished task 1A get JSON from file.");
            Console.WriteLine("Press any key to continue next task.");

            Console.ReadLine();

            Console.WriteLine("/*---------------------------------------*/");
            Console.WriteLine("/* HQ Plus Selection Tests By Joao Prado */");
            Console.WriteLine("/*  Task 1B - Generate JSON From String  */");
            Console.WriteLine("/*---------------------------------------*/");

            var htmlString = File.ReadAllText(Path.Combine(folder, fileName));
            var htmlExtractorFromString = new HtmlExtractor(htmlString);
            var hotelModelB             = htmlExtractorFromString.GetHotelInformation();

            modelJson = JsonSerializer.Serialize(hotelModelB, options);

            Console.WriteLine();
            Console.WriteLine("Extracted JSON from string:");
            Console.WriteLine("--------------------------------------");
            Console.WriteLine(modelJson);
            Console.WriteLine("--------------------------------------");
            Console.WriteLine("Finished task 1B get JSON from string.");
            Console.WriteLine("Press any key to continue next task.");

            Console.ReadLine();

            Console.WriteLine("/*---------------------------------------*/");
            Console.WriteLine("/* HQ Plus Selection Tests By Joao Prado */");
            Console.WriteLine("/*  Task 1C - Generate JSON From Stream  */");
            Console.WriteLine("/*---------------------------------------*/");
            Console.WriteLine();

            using (var fileStream = new FileStream(Path.Combine(folder, fileName), FileMode.Open, FileAccess.Read))
            {
                var htmlExtractorFromStream = new HtmlExtractor(fileStream);
                var hotelModelC             = htmlExtractorFromStream.GetHotelInformation();
                modelJson = JsonSerializer.Serialize(hotelModelC, options);

                Console.WriteLine("Extracted JSON from stream:");
                Console.WriteLine("--------------------------------------");
                Console.WriteLine(modelJson);
                Console.WriteLine("--------------------------------------");
            }

            // Task 1C reads from a stream; the message previously said "string".
            Console.WriteLine("Finished task 1C get JSON from stream.");
            Console.WriteLine("HQ Plus Task 1 finished.");
            Console.WriteLine("Press any key to exit. ");
            Console.ReadLine();
        }
コード例 #21
0
        /// <summary>
        /// Derives settings from the homepage HTML. Every setting that depends on the homepage markup belongs here.
        /// The function is switched on and off through IncludeHomePageSettings when blog settings are detected, and
        /// none of its checks run if the internet is not active. The individual checks only accumulate values; they
        /// are applied in one step at the end, at which point the settings get written back to the registry.
        /// </summary>
        private void DetectHomePageSettings()
        {
            // Nothing to inspect without a homepage document.
            if (_homepageAccessor.HtmlDocument == null)
            {
                return;
            }

            IDictionary overrides = new Hashtable();

            Debug.Assert(!UseManifestCache, "This code will not run correctly under the manifest cache, due to option overrides not being set");

            // Character set declared by the page.
            LightWeightHTMLMetaData metaData = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument);
            if (metaData.Charset != null)
            {
                try
                {
                    overrides.Add(BlogClientOptions.CHARACTER_SET, metaData.Charset);
                }
                catch (NotSupportedException)
                {
                    //not an actual encoding
                }
            }

            // A doctype mentioning "xhtml" means the blog requires XHTML.
            string docType = new LightWeightHTMLMetaData(_homepageAccessor.HtmlDocument).DocType;
            if (docType != null && docType.IndexOf("xhtml", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                overrides.Add(BlogClientOptions.REQUIRES_XHTML, true.ToString(CultureInfo.InvariantCulture));
            }

            // Right-to-left detection: first <html dir> or <body dir> tag found.
            HtmlExtractor     dirSeeker       = new HtmlExtractor(_homepageAccessor.HtmlDocument.RawHtml);
            IElementPredicate dirOnHtmlOrBody = new OrPredicate(
                new SmartPredicate("<html dir>"),
                new SmartPredicate("<body dir>"));

            if (dirSeeker.Seek(dirOnHtmlOrBody).Success)
            {
                string direction = ((BeginTag)dirSeeker.Element).GetAttributeValue("dir");
                if (String.Equals(direction, "rtl", StringComparison.OrdinalIgnoreCase))
                {
                    overrides.Add(BlogClientOptions.TEMPLATE_IS_RTL, true.ToString(CultureInfo.InvariantCulture));
                }
            }

            // DHTML image viewer referenced by the page's scripts, if any.
            if (_homepageAccessor.HtmlDocument != null)
            {
                ImageViewer detected = DhtmlImageViewers.DetectImageViewer(_homepageAccessor.OriginalHtml, _context.HomepageUrl);
                if (detected != null)
                {
                    overrides.Add(BlogClientOptions.DHTML_IMAGE_VIEWER, detected.Name);
                }
            }

            _context.HomePageOverrides = overrides;
        }
コード例 #22
0
        /// <summary>
        /// Sample entry point: builds an extraction template from <c>amazonEmail1.html</c>,
        /// applies it to <c>amazonEmail2.html</c>, prints the result as JSON and writes it
        /// to <c>ECommerceOrder.csv</c>.
        /// </summary>
        public static void Main()
        {
            C1.TextParser.LicenseManager.Key = License.Key;

            // Both input streams are disposed deterministically. They stay open until the
            // results have been written, since it is not visible from here whether the
            // extractor or its result read from them lazily.
            /**************************************************Amazon template*********************************************/
            using (Stream amazonTemplateStream = File.Open(@"amazonEmail1.html", FileMode.Open))
            {
                HtmlExtractor amazonTemplate = new HtmlExtractor(amazonTemplateStream);

                //Repeated block for each article in the order
                String articleNameXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[1]/td[2]/p/a";

                amazonTemplate.AddPlaceHolder("ordered articles", "article name", articleNameXPath);
                String articlePriceXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[1]/td[3]/strong";

                amazonTemplate.AddPlaceHolder("ordered articles", "article price", articlePriceXPath);
                String articleSellerXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[1]/td[2]/p/span";

                amazonTemplate.AddPlaceHolder("ordered articles", "article seller", articleSellerXPath, 8, 18);

                //Fixed placeHolder for the expected delivery date
                String deliveryDateXPath = @"/html/body/div[2]/div/div/div/table/tbody/tr[3]/td/table/tbody/tr[1]/td[1]/p/strong";

                amazonTemplate.AddPlaceHolder("delivery date", deliveryDateXPath);

                //Fixed placeHolder for the total amount of the order
                String totalAmountXPath = @"//*[@id=""shipmentDetails""]/table/tbody/tr[8]/td[2]/strong";

                amazonTemplate.AddPlaceHolder("total order amount", totalAmountXPath);

                //Fixed placeHolder for the customer name
                String customerNameXPath = @"/html/body/div[2]/div/div/div/table/tbody/tr[2]/td/p[1]";

                amazonTemplate.AddPlaceHolder("customer name", customerNameXPath, 6, 15);
                /***************************************************************************************************************/

                using (Stream source = File.Open(@"amazonEmail2.html", FileMode.Open))
                {
                    IExtractionResult extractedResult = amazonTemplate.Extract(source);

                    Console.WriteLine("------------------------------------------------------------------------------------------------------------");
                    Console.WriteLine("GrapeCity, inc, all rights reserved");
                    Console.WriteLine("Demo of the C1TextParser library - Html extractor sample");
                    Console.WriteLine("Test case: From amazon order emails extract relevant information about the order itself.");
                    Console.WriteLine("           This sample pretends to demonstrate the repeated place holder extraction capabilities of");
                    Console.WriteLine("           C1TextParser - Html extractor");
                    Console.WriteLine("Detail: The sample consists on three fixed place holders and one repeated block. The fixed place holders are");
                    Console.WriteLine("        the customer name, the order delivery date and also the total amount of the order. The repeated ");
                    Console.WriteLine("        block is used to extract each article that appear in the ordered article list. It contains three");
                    Console.WriteLine("        repeated place holders. These are: the name, the price and the seller of the article.");
                    Console.WriteLine("        The amazon email used as the extraction source is \"amazonEmail2.html\" and can be consulted in the");
                    Console.WriteLine("        current working directory. Also, \"ECommerceOrder.csv\" contains the parsing result");
                    Console.WriteLine("------------------------------------------------------------------------------------------------------------");

                    Console.WriteLine("------------------------------------------------------------------------------------------------------------");
                    Console.WriteLine("JSon String result:");
                    Console.WriteLine("------------------------------------------------------------------------------------------------------------");
                    Console.WriteLine(extractedResult.ToJsonString());
                    Console.WriteLine("------------------------------------------------------------------------------------------------------------");

                    // Export the fixed place holders and the repeated block as two CSV sections.
                    AmazonTemplateFixedPlaceHolders amazonTemplateFixedPlaceHolders = extractedResult.Get <AmazonTemplateFixedPlaceHolders>();
                    StringBuilder sb1 = CsvExportHelper.ExportList(new List <AmazonTemplateFixedPlaceHolders>()
                    {
                        amazonTemplateFixedPlaceHolders
                    });
                    var           amazonTemplateOrderedItems = extractedResult.Get <AmazonTemplateRepeatedBlocks>().OrderedItems;
                    StringBuilder sb2 = CsvExportHelper.ExportList(amazonTemplateOrderedItems);
                    var           sb3 = sb1 + "\n" + sb2;

                    File.WriteAllText("ECommerceOrder.csv", sb3);
                }
            }

            Console.ReadLine();
        }
コード例 #23
0
        /// <summary>
        /// Html extractor sample: builds an extraction template from "vietjetairEmail1.html" using
        /// seven fixed placeholders, applies it to "vietjetairEmail2.html", prints the result as JSON
        /// to the console and writes it to "FlightETicket.csv". Waits for a key press before exiting.
        /// </summary>
        public static void Main()
        {
            C1.TextParser.LicenseManager.Key = License.Key;

            // Both file streams are wrapped in using statements so their handles are released
            // even when template construction or extraction throws. They are kept open until all
            // work with the extractor and its result is finished.
            using (Stream vietjetairTemplateStream = File.Open(@"vietjetairEmail1.html", FileMode.Open))
            {
                /***********************************************Vietjetair template********************************************/
                HtmlExtractor vietjetairTemplate = new HtmlExtractor(vietjetairTemplateStream);

                // Fixed placeholder for the passenger name
                String passengerNameXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[2]/tbody/tr[3]/td";
                vietjetairTemplate.AddPlaceHolder("passenger name", passengerNameXPath);

                // Fixed placeholder for the booking number
                String bookingNumberXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[1]/tbody/tr/td[2]/span";
                vietjetairTemplate.AddPlaceHolder("booking number", bookingNumberXPath);

                // Fixed placeholder for the booking status
                String bookingStatusXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[2]/tbody/tr[1]/td[1]";
                vietjetairTemplate.AddPlaceHolder("booking status", bookingStatusXPath);

                // Fixed placeholder for the fare type
                String fareTypeXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[4]/tbody/tr/td[3]";
                vietjetairTemplate.AddPlaceHolder("fare type", fareTypeXPath);

                // Fixed placeholder for the total amount
                String totalAmountXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[6]/tbody/tr[2]/td/table[2]/tbody/tr[2]/td[3]";
                vietjetairTemplate.AddPlaceHolder("total amount", totalAmountXPath);

                // Fixed placeholder for the city of departure.
                // NOTE(review): the two trailing integers presumably select a sub-range of the
                // node text (start position and length) - confirm against the AddPlaceHolder docs.
                String cityOfDepartureXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[4]/tbody/tr/td[4]/text()";
                vietjetairTemplate.AddPlaceHolder("city of departure", cityOfDepartureXPath, 8, 12);

                // Fixed placeholder for the year of the booking date (same overload as above)
                String yearOfBookingXPath = @"/html/body/div/div[4]/div[1]/div[2]/div[2]/table[2]/tbody/tr[2]/td[1]";
                vietjetairTemplate.AddPlaceHolder("year of booking", yearOfBookingXPath, 6, 4);
                /***************************************************************************************************************/

                using (Stream source = File.Open(@"vietjetairEmail2.html", FileMode.Open))
                {
                    IExtractionResult extractedResult = vietjetairTemplate.Extract(source);

                    Console.WriteLine("------------------------------------------------------------------------------------------------------------------");
                    Console.WriteLine("GrapeCity, inc, all rights reserved");
                    Console.WriteLine("Demo of the C1TextParser library - Html extractor sample");
                    Console.WriteLine("Test case: Test case: From a vietjetair e-ticket extract relevant information about the flight. Note that the");
                    Console.WriteLine("           email used as extraction source was modified on purpose (added random text at different locations)");
                    Console.WriteLine("           with the intent to show that html extractor is flexible enough to retrieve the intended text.");
                    Console.WriteLine("Detail: This consists on seven fixed place holders. These are: the passenger name, the booking number, the");
                    Console.WriteLine("        booking status, the fare type, the total amount, the city of departure and, finally, the year of booking");
                    Console.WriteLine("        The vietjetair email used as the extraction source is \"vietjetairEmail2.html\" and can be consulted");
                    Console.WriteLine("        in the current working directory. Also, \"FlightETicket.csv\" contains the parsing result");
                    Console.WriteLine("------------------------------------------------------------------------------------------------------------------");

                    Console.WriteLine("------------------------------------------------------------------------------------------------------------------");
                    Console.WriteLine("JSon String result:");
                    Console.WriteLine("------------------------------------------------------------------------------------------------------------------");
                    Console.WriteLine(extractedResult.ToJsonString());
                    Console.WriteLine("------------------------------------------------------------------------------------------------------------------");

                    // Export the single extracted ticket as a one-row CSV list.
                    FlightTicket  vietjetairResult = extractedResult.Get<FlightTicket>();
                    StringBuilder sb = CsvExportHelper.ExportList(new List<FlightTicket>()
                    {
                        vietjetairResult
                    });

                    File.WriteAllText("FlightETicket.csv", sb.ToString());
                }
            }

            // Keep the console window open until the user presses Enter.
            Console.ReadLine();
        }
コード例 #24
0
        /// <summary>
        /// Searches the head of the supplied homepage HTML for an Atom service link
        /// (<c>&lt;link rel='service' type='application/atomsvc+xml' href=...&gt;</c>) and, when one
        /// is found, configures the provider/client fields of this instance from the generic Atom
        /// provider, verifies the credentials and loads the user's blogs.
        /// </summary>
        /// <param name="url">Base URL used to resolve the link's href into an absolute post API URL.</param>
        /// <param name="html">Homepage HTML to scan; <c>null</c> yields <c>false</c>.</param>
        /// <param name="preferredOnly">
        /// When <c>true</c>, the link is accepted only if its class attribute contains the word "preferred".
        /// </param>
        /// <returns><c>true</c> when an Atom service link was detected and the settings were populated; otherwise <c>false</c>.</returns>
        private bool AttemptGenericAtomLinkDetection(string url, string html, bool preferredOnly)
        {
            const string GENERIC_ATOM_PROVIDER_ID = "D48F1B5A-06E6-4f0f-BD76-74F34F520792";

            if (html == null)
            {
                return(false);
            }

            HtmlExtractor ex = new HtmlExtractor(html);

            bool serviceLinkFound = ex
                                    .SeekWithin("<head>", "<body>")
                                    .SeekWithin("<link href rel='service' type='application/atomsvc+xml'>", "</head>")
                                    .Success;
            if (!serviceLinkFound)
            {
                return(false);
            }

            IBlogProvider atomProvider = BlogProviderManager.FindProvider(GENERIC_ATOM_PROVIDER_ID);
            BeginTag      bt           = ex.Element as BeginTag;

            // Both values are dereferenced below. Treat a missing provider or a matched element
            // that is not a begin tag as "not detected" instead of failing with a
            // NullReferenceException.
            if (atomProvider == null || bt == null)
            {
                return(false);
            }

            if (preferredOnly)
            {
                string classes = bt.GetAttributeValue("class");
                if (classes == null || !Regex.IsMatch(classes, @"\bpreferred\b"))
                {
                    return(false);
                }
            }

            string linkUrl = bt.GetAttributeValue("href");

            Debug.WriteLine("Atom service link detected in the blog homepage");

            _providerId  = atomProvider.Id;
            _serviceName = atomProvider.Name;
            _clientType  = atomProvider.ClientType;
            _blogName    = string.Empty;
            _postApiUrl  = GetAbsoluteUrl(url, linkUrl);

            IBlogClient client = BlogClientManager.CreateClient(atomProvider.ClientType, _postApiUrl, _credentials);
            client.VerifyCredentials();
            _usersBlogs = client.GetUsersBlogs();

            // Only when the account exposes exactly one blog can it be selected automatically.
            if (_usersBlogs.Length == 1)
            {
                _hostBlogId = _usersBlogs[0].Id;
                _blogName   = _usersBlogs[0].Name;

                // NOTE: adopting _usersBlogs[0].HomepageUrl as _homepageUrl was disabled
                // (commented out) in the original code and remains disabled here.
            }

            // attempt to read the blog name from the homepage title
            if (string.IsNullOrEmpty(_blogName))
            {
                HtmlExtractor titleExtractor = new HtmlExtractor(html);
                if (titleExtractor.Seek("<title>").Success)
                {
                    _blogName = titleExtractor.CollectTextUntil("title");
                }
            }

            return(true);
        }