/// <summary>
/// Parses a syndication (RSS/Atom) document held in a string and maps each entry to a <see cref="FeedItem"/>.
/// </summary>
/// <param name="value">The raw syndication XML text.</param>
/// <returns>
/// The entries that could be mapped. An entry whose mapping throws (for example because it has
/// no summary or no links) is skipped rather than failing the whole feed.
/// </returns>
/// <exception cref="ArgumentNullException"><paramref name="value"/> is null.</exception>
private IEnumerable<FeedItem> ExtractFeedItemsFromSyndicationString(string value)
{
    if (value == null)
    {
        throw new ArgumentNullException("value");
    }

    var items = new List<FeedItem>();

    // Swap the "-0001" year placeholder for the current year before parsing.
    // NOTE(review): presumably the placeholder date cannot be parsed by SyndicationFeed — confirm.
    string content = value.Replace(
        "-0001 00:00:00 +0000",
        string.Format("{0} 00:00:00 +0000", DateTime.Now.Year));

    // Parse directly from the text. Round-tripping through UTF-8 bytes would make the XML reader
    // honour the document's encoding declaration, which no longer matches the re-encoded bytes
    // when the feed declares anything other than UTF-8.
    using (var textReader = new StringReader(content))
    using (XmlReader reader = XmlReader.Create(textReader))
    {
        SyndicationFeed feed = SyndicationFeed.Load(reader);

        foreach (SyndicationItem item in feed.Items)
        {
            try
            {
                items.Add(new FeedItem()
                {
                    Title = htmlConverter.Convert(item.Title.Text),
                    Summary = htmlConverter.Convert(item.Summary.Text),
                    Url = item.Links[0].Uri,
                    // NOTE(review): fixed +5h shift (original comment: "adjust for EST");
                    // it does not account for daylight saving — confirm intent.
                    PublishedDate = item.PublishDate.DateTime.AddHours(5).ToLocalTime(),
                    IsNew = true
                });
            }
            catch (Exception)
            {
                // Deliberate best-effort: a single malformed entry must not discard the rest of the feed.
            }
        }
    }

    return items;
}
/// <summary>
/// Converts <paramref name="input"/> with a fresh <see cref="HtmlToTextConverter"/> and
/// asserts that the result equals <paramref name="expected"/>.
/// </summary>
/// <param name="input">The text handed to the converter.</param>
/// <param name="expected">The text the converter is expected to produce.</param>
public void RunConversion(string input, string expected)
{
    var sut = new HtmlToTextConverter();

    string actual = sut.Convert(input);

    Assert.Equal(expected, actual);
}
/// <summary>
/// Loads <paramref name="html"/> into a browsing context created from the default configuration
/// and converts the resulting document body with an <see cref="HtmlToTextConverter"/>.
/// </summary>
/// <param name="html">The markup to load as the document content.</param>
/// <returns>A task whose result is the converter's output for the document body.</returns>
public static async Task<string> HtmlToText(string html)
{
    var browsingContext = BrowsingContext.New(Configuration.Default);
    var parsedDocument = await browsingContext.OpenAsync(request => request.Content(html));

    return new HtmlToTextConverter().Convert(parsedDocument.Body);
}
/// <summary>
/// Builds one <see cref="ImageItem"/> for every <see cref="BlogEntryDTO"/> found in the
/// objects of the given training day.
/// </summary>
/// <param name="day">The training day whose <c>Objects</c> are scanned for blog entries.</param>
/// <returns>An array with one item per blog entry; empty when the day has none.</returns>
public ImageItem[] GetDayContents(TrainingDayDTO day)
{
    var commentConverter = new HtmlToTextConverter();

    return day.Objects
        .OfType<BlogEntryDTO>()
        .Select(blogEntry => new ImageItem
        {
            BackBrush = EntryObjectColors.Blog,
            // The blog comment is run through the converter to obtain the displayed string.
            Content = (string)commentConverter.Convert(blogEntry.Comment, typeof(string), null, CultureInfo.CurrentCulture),
            Entry = blogEntry,
            // Right-hand sides below are members of the enclosing class, not of ImageItem.
            ToolTip = Name,
            Image = Image
        })
        .ToArray();
}
/// <summary>
/// Downloads a fixed web page, converts its HTML with <see cref="HtmlToTextConverter"/>,
/// then writes the result to <c>out.txt</c> and to the console.
/// </summary>
/// <param name="args">Command-line arguments (not used).</param>
static async Task Main(string[] args)
{
    string url = "https://openfiber.it/mondo-open-fiber/comunicati-stampa/";
    // Alternative pages kept for manual experiments:
    //url = "https://event.unitn.it/cerimonia-laurea/";
    //url = "https://blog.botfactory.it";
    //url = "https://www.trentinoinrete.it/Documentazioni-per-gli-Enti-Locali/Previsione-degli-interventi-per-comune";

    string html;

    // Dispose the client once the single download has finished so its connection is released.
    using (HttpClient http = new HttpClient())
    {
        html = await http.GetStringAsync(url);
    }

    HtmlToTextConverter converter = new HtmlToTextConverter();
    string output = converter.Convert(html);

    File.WriteAllText("out.txt", output);
    Console.WriteLine(output);
}
/// <summary>
/// Adds one index document per existing file: the file's path, its last-modified timestamp
/// and every word of its converted text content.
/// </summary>
/// <param name="writer">The index writer that receives the documents and is committed periodically.</param>
/// <param name="directoryInfo">The directory information where all the files that are to be added are located; its root path is stripped from each stored path.</param>
/// <param name="files">The list of files that are to be added. Entries that do not exist on disk are skipped.</param>
/// <param name="documents">
/// The supported documents search filter. Supplies <c>TokenizeContent</c>, <c>StoreContent</c> and
/// <c>MaxDocumentSizePerCommit</c>, and has its <c>TotalDocumentSize</c> updated as files are added.
/// </param>
public void AddDocuments(Lucene.Net.Index.IndexWriter writer, DirectoryInfo directoryInfo, string[] files, SupportedDocumentExtension documents)
{
    Nequeo.Html.HtmlToTextConverter converter = new HtmlToTextConverter();

    // Field type shared by the "path" and "modified" fields: indexed and stored, not tokenized.
    FieldType pathFieldType = new Lucene.Net.Documents.FieldType()
    {
        Indexed = true,
        Tokenized = false,
        Stored = true,
        IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
    };

    // Field type for each "content" word; tokenizing and storing are driven by the filter settings.
    FieldType contentFieldType = new Lucene.Net.Documents.FieldType()
    {
        Indexed = true,
        Tokenized = documents.TokenizeContent,
        Stored = documents.StoreContent,
        IndexOptions = FieldInfo.IndexOptions.DOCS_AND_FREQS_AND_POSITIONS,
    };

    foreach (string filePath in files)
    {
        // Skip entries that are not present on disk.
        if (!File.Exists(filePath))
        {
            continue;
        }

        Lucene.Net.Documents.Document document = new Lucene.Net.Documents.Document();
        FileInfo fileInfo = new FileInfo(filePath);

        // Stored path: root removed, lower-cased, forward slashes.
        string relativePath = filePath
            .Replace(directoryInfo.Root.FullName, "")
            .ToLower()
            .Replace("\\", "/");

        // NOTE(review): the timestamp uses the short date/time formats of the current culture,
        // so the stored value depends on the machine's regional settings — confirm this is intended.
        string modifiedText = fileInfo.LastWriteTime.ToShortDateString() + " " + fileInfo.LastWriteTime.ToShortTimeString();

        document.Add(new Field("path", relativePath, pathFieldType));
        document.Add(new Field("modified", modifiedText, pathFieldType));

        // Convert the file to plain text; the converter is handed the file path.
        string content = converter.Convert(filePath);
        if (!String.IsNullOrEmpty(content))
        {
            // Break the text into words (project extension method).
            string[] words = content.Words();
            if (words != null && words.Length > 0)
            {
                foreach (string rawWord in words)
                {
                    // Normalise the word before indexing it.
                    string word = rawWord.ToLower().RemovePunctuationFromStartAndEnd();
                    if (!String.IsNullOrEmpty(word))
                    {
                        document.Add(new Field("content", word, contentFieldType));
                    }
                }
            }
        }

        // Add the document.
        writer.AddDocument(document.Fields);

        // Commit once the accumulated file size exceeds the configured threshold, then restart the count.
        documents.TotalDocumentSize += fileInfo.Length;
        if (documents.TotalDocumentSize > documents.MaxDocumentSizePerCommit)
        {
            writer.Commit();
            documents.TotalDocumentSize = 0;
        }
    }
}