示例#1
0
        /// <summary>
        /// Parses a syndication (RSS/Atom) document held in a string and maps every entry
        /// to a <see cref="FeedItem"/>.
        /// </summary>
        /// <param name="value">The complete feed XML.</param>
        /// <returns>
        /// The items that could be mapped. Entries whose mapping throws are skipped, so the
        /// result may contain fewer items than the feed.
        /// </returns>
        /// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
        private IEnumerable <FeedItem> ExtractFeedItemsFromSyndicationString(string value)
        {
            if (value == null)
            {
                throw new ArgumentNullException("value");
            }

            var items = new List <FeedItem>();

            // Timestamps ending in "-0001 00:00:00 +0000" are rewritten to carry the current
            // year before the document is handed to the feed parser.
            string content = value.Replace("-0001 00:00:00 +0000", string.Format("{0} 00:00:00 +0000", DateTime.Now.Year));
            byte[] bytes   = System.Text.Encoding.UTF8.GetBytes(content);

            // XmlReader does not close its input stream by default, so the stream gets its own using.
            using (MemoryStream stream = new MemoryStream(bytes))
            using (XmlReader reader = XmlReader.Create(stream))
            {
                SyndicationFeed feed = SyndicationFeed.Load(reader);
                foreach (SyndicationItem item in feed.Items)
                {
                    try
                    {
                        items.Add(new FeedItem()
                        {
                            Title         = htmlConverter.Convert(item.Title.Text),
                            Summary       = htmlConverter.Convert(item.Summary.Text),
                            Url           = item.Links[0].Uri,
                            PublishedDate = item.PublishDate.DateTime.AddHours(5).ToLocalTime(), // adjust for EST
                            IsNew         = true
                        });
                    }
                    catch (Exception)
                    {
                        // Deliberate best effort: an entry that cannot be mapped (for example one
                        // without a summary or a link) is dropped instead of failing the whole feed.
                    }
                }
            }

            return(items);
        }
示例#2
0
        /// <summary>
        /// Converts <paramref name="input"/> with a fresh <see cref="HtmlToTextConverter"/> and
        /// asserts that the result equals <paramref name="expected"/>.
        /// </summary>
        /// <param name="input">The text handed to the converter.</param>
        /// <param name="expected">The text the converter must produce.</param>
        public void RunConversion(string input, string expected)
        {
            // Arrange
            var sut = new HtmlToTextConverter();

            // Act
            string actual = sut.Convert(input);

            // Assert
            Assert.Equal(expected, actual);
        }
        /// <summary>
        /// Loads the given HTML into a new browsing context and converts the resulting
        /// document body with an <see cref="HtmlToTextConverter"/>.
        /// </summary>
        /// <param name="html">The HTML markup to load.</param>
        /// <returns>A task whose result is the converter's output for the document body.</returns>
        public static async Task <string> HtmlToText(string html)
        {
            // Build a browsing context from the default configuration and feed it the markup.
            var browsingContext = BrowsingContext.New(Configuration.Default);
            var parsedDocument  = await browsingContext.OpenAsync(request => request.Content(html));

            var textConverter = new HtmlToTextConverter();
            return textConverter.Convert(parsedDocument.Body);
        }
        /// <summary>
        /// Creates one <see cref="ImageItem"/> for every <see cref="BlogEntryDTO"/> found among
        /// the objects of the given training day.
        /// </summary>
        /// <param name="day">The training day whose objects are scanned for blog entries.</param>
        /// <returns>
        /// The created items in enumeration order; an empty array when the day has no blog entries.
        /// </returns>
        public ImageItem[] GetDayContents(TrainingDayDTO day)
        {
            var commentConverter = new HtmlToTextConverter();
            var result           = new List <ImageItem>();

            foreach (BlogEntryDTO blogEntry in day.Objects.OfType <BlogEntryDTO>())
            {
                // The blog comment is run through the converter to obtain the item's content string.
                result.Add(new ImageItem
                {
                    BackBrush = EntryObjectColors.Blog,
                    Content   = (string)commentConverter.Convert(blogEntry.Comment, typeof(string), null, CultureInfo.CurrentCulture),
                    Entry     = blogEntry,
                    ToolTip   = Name,
                    Image     = Image
                });
            }

            return result.ToArray();
        }
示例#5
0
        /// <summary>
        /// Downloads a web page, converts its HTML to plain text, and writes the text both to
        /// the file "out.txt" and to the console.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static async Task Main(string[] args)
        {
            string url = "https://openfiber.it/mondo-open-fiber/comunicati-stampa/";
            // Alternative pages; uncomment one to override the URL above.
            //url = "https://event.unitn.it/cerimonia-laurea/";
            //url = "https://blog.botfactory.it";
            //url = "https://www.trentinoinrete.it/Documentazioni-per-gli-Enti-Locali/Previsione-degli-interventi-per-comune";

            string html;

            // HttpClient is IDisposable; scope it to the single download it performs.
            using (HttpClient http = new HttpClient())
            {
                html = await http.GetStringAsync(url);
            }

            HtmlToTextConverter converter = new HtmlToTextConverter();
            string output = converter.Convert(html);

            File.WriteAllText("out.txt", output);

            Console.WriteLine(output);
        }
示例#6
0
        /// <summary>
        /// Adds one Lucene document per existing file to the index writer.
        /// </summary>
        /// <remarks>
        /// Each document carries a "path" field, a "modified" field and one "content" field per
        /// non-empty word obtained from the converted file. The writer is committed whenever the
        /// running total of file sizes kept on <paramref name="documents"/> exceeds its
        /// MaxDocumentSizePerCommit, after which the total is reset to zero. Files that do not
        /// exist are skipped; any exception raised while processing a file propagates to the caller.
        /// </remarks>
        /// <param name="writer">The index writer.</param>
        /// <param name="directoryInfo">The directory information where all the files that are to be added are located.</param>
        /// <param name="files">The list of files that are to be added.</param>
        /// <param name="documents">The supported documents search filter, used to indicate what files are to be added.</param>
        public void AddDocuments(Lucene.Net.Index.IndexWriter writer, DirectoryInfo directoryInfo, string[] files, SupportedDocumentExtension documents)
        {
            Nequeo.Html.HtmlToTextConverter stream = new HtmlToTextConverter();

            // "path" and "modified" are stored verbatim (not tokenized).
            FieldType pathFieldType = new Lucene.Net.Documents.FieldType()
            {
                Indexed      = true,
                Tokenized    = false,
                Stored       = true,
                IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
            };

            // Tokenizing and storing of "content" is driven by the caller's settings.
            FieldType contentFieldType = new Lucene.Net.Documents.FieldType()
            {
                Indexed      = true,
                Tokenized    = documents.TokenizeContent,
                Stored       = documents.StoreContent,
                IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
            };

            foreach (string filePath in files)
            {
                // Only files that are actually present are indexed.
                if (!File.Exists(filePath))
                {
                    continue;
                }

                Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document();
                FileInfo fileInfo = new FileInfo(filePath);

                // NOTE(review): this strips the root of the directory (DirectoryInfo.Root), not the
                // directory path itself - confirm that this is the intended relative path.
                string indexedPath = filePath.Replace(directoryInfo.Root.FullName, "").ToLower().Replace("\\", "/");
                string modifiedOn  = fileInfo.LastWriteTime.ToShortDateString() + " " + fileInfo.LastWriteTime.ToShortTimeString();

                // Add the fields.
                document.Add(new Field("path", indexedPath, pathFieldType));
                document.Add(new Field("modified", modifiedOn, pathFieldType));

                // Convert the file and split the result into words.
                string   content = stream.Convert(filePath);
                string[] words   = String.IsNullOrEmpty(content) ? null : content.Words();

                if (words != null)
                {
                    foreach (string rawWord in words)
                    {
                        // Format the word.
                        string word = rawWord.ToLower().RemovePunctuationFromStartAndEnd();

                        // Words that are empty after formatting are not indexed.
                        if (!String.IsNullOrEmpty(word))
                        {
                            document.Add(new Field("content", word, contentFieldType));
                        }
                    }
                }

                // Add the document.
                writer.AddDocument(document.Fields);

                // Commit once the accumulated file size passes the configured threshold.
                documents.TotalDocumentSize += fileInfo.Length;
                if (documents.TotalDocumentSize > documents.MaxDocumentSizePerCommit)
                {
                    writer.Commit();
                    documents.TotalDocumentSize = 0;
                }
            }
        }