// Registers the passthrough and HTML extractors on a host, runs a "text/plain"
// request through the scoped extractor and checks that every expected fragment
// shows up in at least one extracted segment.
public void ExtractorHostTestTextPlain()
{
    var host = new ExtractorHost();
    host.RegisterScopedExtractor<PassthroughExtractor>();
    host.RegisterScopedExtractor<DefaultHtmlExtractor>();
    host.Initialize();

    // Fragments that must each appear in some segment, checked in this order.
    var expectedFragments = new[] { "1", "2", "3", "_", "4", "html" };

    using (host.BeginServiceScope(out var extractor))
    {
        var requestDetails = new IndexingRequestDetails(
            CultureInfo.InvariantCulture,
            Encoding.Unicode,
            "text/plain",
            string.Empty,
            string.Empty);

        var extraction = extractor.ExtractText(requestDetails, GetTestObjectStream());

        foreach (var fragment in expectedFragments)
        {
            Assert.IsTrue(extraction.ExtractionPointDetails.Any(c => c.Segment.Contains(fragment)));
        }
    }
}
/// <summary>
/// Chooses one of the registered extractors for the request and delegates the extraction to it.
/// </summary>
/// <remarks>
/// Only extractors whose <c>CanExtract</c> accepts the request's culture, MIME type and schema are
/// considered. Among those, <c>details.Handler</c> is compared with the extractor type's
/// assembly-qualified name, then its full name, then its short name; when none of them match,
/// the first capable extractor is used.
/// </remarks>
/// <param name="details">Request details used both to select the extractor and as its input.</param>
/// <param name="reader">Content stream, handed unchanged to the selected extractor.</param>
/// <returns>The result produced by the selected extractor.</returns>
/// <exception cref="System.InvalidOperationException">
/// No registered extractor reports that it can handle the request.
/// </exception>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream reader)
{
    var available = Extractors
        .Where(e => e.CanExtract(details.Culture, details.MimeType, details.Schema))
        .ToArray();

    // Without this guard the fallback below would index into an empty array and
    // surface as an IndexOutOfRangeException that says nothing about the cause.
    if (available.Length == 0)
    {
        throw new System.InvalidOperationException(
            $"No registered extractor can handle MIME type '{details.MimeType}'.");
    }

    // Preference order: assembly-qualified name, full name, short name, then
    // simply the first extractor that can handle the request.
    var choice = available.FirstOrDefault(e => e.GetType().AssemblyQualifiedName == details.Handler)
                 ?? available.FirstOrDefault(e => e.GetType().FullName == details.Handler)
                 ?? available.FirstOrDefault(e => e.GetType().Name == details.Handler)
                 ?? available[0];

    return choice.ExtractText(details, reader);
}
/// <summary>
/// Loads the stream as an XML document and turns it into an annotated extraction result.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and is attached to the result.</param>
/// <param name="stream">XML content. The stream is not closed when the internal reader is disposed.</param>
/// <returns>The extraction result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    // No BOM detection, small buffer, leaveOpen: true.
    using (var reader = new StreamReader(stream, details.Encoding, false, 16, true))
    {
        var xml = XDocument.Load(reader);
        var result = new ExtractionResult(details);

        if (Extensions.Count != 0)
        {
            // Extensions are tried in order; the first one that reports success ends the search.
            foreach (var extension in Extensions)
            {
                var handled = extension.TryProcess(result, xml);
                if (handled)
                {
                    break;
                }
            }
        }
        else
        {
            // No extensions: segment the whole document with the default XElement degrapher.
            result.GenerateSegments(xml, SimpleDegrapher.XElementDegrapher);
        }

        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Reads the whole stream as text and builds an annotated extraction result from its non-empty lines.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and is attached to the result.</param>
/// <param name="stream">Text content. The stream is not closed when the internal reader is disposed.</param>
/// <returns>The extraction result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    // No BOM detection, small buffer, leaveOpen: true.
    using (var reader = new StreamReader(stream, details.Encoding, false, 16, true))
    {
        var text = reader.ReadToEnd();

        // Break on CR or LF and discard the empty pieces (blank lines and the gap inside CRLF).
        var lines = text.Split(new[] { '\r', '\n' }, System.StringSplitOptions.RemoveEmptyEntries);

        var result = new ExtractionResult(details);
        result.GenerateSegments(lines, null);
        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Reads the whole stream as JSON text, deserializes it and builds an annotated extraction result.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and is attached to the result.</param>
/// <param name="stream">JSON content. The stream is not closed when the internal reader is disposed.</param>
/// <returns>The extraction result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    // No BOM detection, small buffer, leaveOpen: true.
    using (var reader = new StreamReader(stream, details.Encoding, false, 16, true))
    {
        var json = reader.ReadToEnd();
        var graph = JsonConvert.DeserializeObject(json);

        var result = new ExtractionResult(details);
        result.GenerateSegments(graph, JsonDegrapher);
        result.AnnotateSegments();
        return result;
    }
}
// Runs the default HTML extractor directly on the test stream and checks that
// every expected fragment shows up in at least one extracted segment.
public void HtmlExtractorTest()
{
    // Fragments that must each appear in some segment, checked in this order.
    var expectedFragments = new[] { "1", "2", "3", "_", "4" };

    var extractor = new DefaultHtmlExtractor();
    var requestDetails = IndexingRequestDetails.Create<DefaultHtmlExtractor>(
        CultureInfo.InvariantCulture,
        Encoding.Unicode,
        "text/html",
        string.Empty);

    var extraction = extractor.ExtractText(requestDetails, GetTestObjectStream());

    foreach (var fragment in expectedFragments)
    {
        Assert.IsTrue(extraction.ExtractionPointDetails.Any(c => c.Segment.Contains(fragment)));
    }
}
/// <summary>
/// Reads the stream line by line, treating each line as tab-separated fields, and builds an
/// annotated extraction result with one entry per line.
/// </summary>
/// <param name="details">Request details; supplies the text encoding and is attached to the result.</param>
/// <param name="stream">Tab-separated content. The stream is not closed when the internal reader is disposed.</param>
/// <returns>The extraction result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    // No BOM detection, small buffer, leaveOpen: true.
    using (var reader = new StreamReader(stream, details.Encoding, false, 16, true))
    {
        var rows = new List<string>();

        for (var line = reader.ReadLine(); line != null; line = reader.ReadLine())
        {
            // Each entry holds the line's fields in reverse order, one field per CRLF-separated line.
            var fields = line.Split('\t');
            rows.Add(string.Join("\r\n", Enumerable.Reverse(fields)));
        }

        var result = new ExtractionResult(details);
        result.GenerateSegments(rows, null);
        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Builds indexing requests for at most ten <c>*.xml</c> files found anywhere under
/// <c>C:\testDocSource\</c>.
/// </summary>
/// <returns>
/// A lazily evaluated sequence of requests; the directory listing itself is taken when this
/// method is called, and each file is only opened when the request's callback is invoked.
/// </returns>
private IEnumerable<TextIndexingRequest> GetXmlFiles()
{
    var xmlPaths = Directory.GetFiles("C:\\testDocSource\\", "*.xml", SearchOption.AllDirectories);

    return
        from path in xmlPaths.Take(10)
        select new TextIndexingRequest(
            null,
            path,
            "text file",
            "",
            IndexingRequestDetails.Create<DefaultXmlExtractor>(
                CultureInfo.InvariantCulture,
                Encoding.ASCII,
                "text/xml",
                string.Empty),
            _ => File.OpenRead(path));
}
/// <summary>
/// Reads the whole stream as HTML, parses it and builds an annotated extraction result.
/// </summary>
/// <remarks>
/// When <c>FullDocumentCapture</c> is set, segments are generated from the document's
/// <c>TextContent</c>; otherwise they are generated from the parsed document itself.
/// </remarks>
/// <param name="details">Request details; supplies the text encoding and is attached to the result.</param>
/// <param name="stream">HTML content. The stream is not closed when the internal reader is disposed.</param>
/// <returns>The extraction result after segment generation and annotation.</returns>
public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
{
    // No BOM detection, small buffer, leaveOpen: true.
    using (var reader = new StreamReader(stream, details.Encoding, false, 16, true))
    {
        var parser = new HtmlParser(Configuration.Default.WithDefaultLoader());
        var markup = reader.ReadToEnd();
        var dom = parser.Parse(markup);

        // using degrapher because AngleSharp uses recursion
        var result = new ExtractionResult(details);
        if (!FullDocumentCapture)
        {
            result.GenerateSegments(dom, HtmlDegrapher);
        }
        else
        {
            result.GenerateSegments(dom.TextContent, HtmlDegrapher);
        }

        result.AnnotateSegments();
        return result;
    }
}
/// <summary>
/// Creates a request that carries the given indexing details.
/// </summary>
/// <param name="details">Stored as-is in <c>Details</c>; no validation is performed here.</param>
public IndexingRequest(IndexingRequestDetails details) => Details = details;