/// <summary>
/// Loads the stream as an XML document and builds an <see cref="ExtractionResult"/> from it.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and seeds the result.</param>
/// <param name="stream">Source stream. It is left open when this method returns.</param>
/// <returns>The result after segment generation (if any) and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    using (var reader = new StreamReader(stream, details.Encoding, detectEncodingFromByteOrderMarks: false, bufferSize: 16, leaveOpen: true))
    {
        var xml = XDocument.Load(reader);
        var result = new ExtractionResult(details);

        if (Extensions.Count != 0)
        {
            // Offer the document to each extension in order; stop at the first
            // one that reports it processed the document.
            foreach (var extension in Extensions)
            {
                if (extension.TryProcess(result, xml))
                {
                    break;
                }
            }
        }
        else
        {
            // No extensions registered: fall back to the generic element degrapher.
            result.GenerateSegments(xml, SimpleDegrapher.XElementDegrapher);
        }

        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Reads the whole stream as text, splits it into non-empty lines on CR/LF,
/// and builds an <see cref="ExtractionResult"/> from those lines.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and seeds the result.</param>
/// <param name="stream">Source stream. It is left open when this method returns.</param>
/// <returns>The result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    using (var reader = new StreamReader(stream, details.Encoding, detectEncodingFromByteOrderMarks: false, bufferSize: 16, leaveOpen: true))
    {
        string text = reader.ReadToEnd();

        // Splitting on both '\r' and '\n' turns "\r\n" into an empty entry;
        // those (and blank lines) are dropped. Split never yields null entries.
        string[] nonEmptyLines = text
            .Split('\r', '\n')
            .Where(line => line.Length > 0)
            .ToArray();

        var result = new ExtractionResult(details);
        result.GenerateSegments(nonEmptyLines, null);
        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Reads the whole stream as text, deserializes it as JSON, and builds an
/// <see cref="ExtractionResult"/> from the deserialized object.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and seeds the result.</param>
/// <param name="stream">Source stream. It is left open when this method returns.</param>
/// <returns>The result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    using (var reader = new StreamReader(stream, details.Encoding, detectEncodingFromByteOrderMarks: false, bufferSize: 16, leaveOpen: true))
    {
        string json = reader.ReadToEnd();

        // Untyped deserialization: the static type stays 'object', and
        // JsonDegrapher is handed to GenerateSegments alongside it.
        object graph = JsonConvert.DeserializeObject(json);

        var result = new ExtractionResult(details);
        result.GenerateSegments(graph, JsonDegrapher);
        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Reads the stream line by line as tab-separated text and builds an
/// <see cref="ExtractionResult"/> with one entry per input line.
/// </summary>
/// <remarks>
/// Each line is split on tab characters; its columns are emitted in reverse
/// order, joined with "\r\n".
/// </remarks>
/// <param name="details">Request details; supplies the text encoding and seeds the result.</param>
/// <param name="stream">Source stream. It is left open when this method returns.</param>
/// <returns>The result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    using (var reader = new StreamReader(stream, details.Encoding, detectEncodingFromByteOrderMarks: false, bufferSize: 16, leaveOpen: true))
    {
        var entries = new List<string>();

        // ReadLine returns null at end of stream.
        for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            string[] columns = line.Split('\t');
            // Last column first, one column per line within the entry.
            entries.Add(string.Join("\r\n", Enumerable.Reverse(columns)));
        }

        var result = new ExtractionResult(details);
        result.GenerateSegments(entries, null);
        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Reads the whole stream as text, parses it as HTML, and builds an
/// <see cref="ExtractionResult"/> from the parsed document.
/// </summary>
/// <remarks>
/// When <c>FullDocumentCapture</c> is true, segments are generated from the
/// document's <c>TextContent</c>; otherwise they are generated from the
/// parsed document itself. <c>HtmlDegrapher</c> is passed in both cases.
/// </remarks>
/// <param name="details">Request details; supplies the text encoding and seeds the result.</param>
/// <param name="stream">Source stream. It is left open when this method returns.</param>
/// <returns>The result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    using (var reader = new StreamReader(stream, details.Encoding, false, 16, true))
    {
        var config = Configuration.Default.WithDefaultLoader();
        var document = new HtmlParser(config).Parse(reader.ReadToEnd());

        // using degrapher because AngleSharp uses recursion
        // (kept on its own line: as a trailing comment in the middle of the
        // method it would comment out every statement that follows it)
        var returnResult = new ExtractionResult(details);
        if (FullDocumentCapture)
        {
            returnResult.GenerateSegments(document.TextContent, HtmlDegrapher);
        }
        else
        {
            returnResult.GenerateSegments(document, HtmlDegrapher);
        }

        returnResult.AnnotateSegments();
        return returnResult;
    }
}