// Sends a "text/plain" request through an ExtractorHost that has the passthrough and HTML
// extractors registered, then verifies that each expected fragment occurs in some segment.
public void ExtractorHostTestTextPlain()
        {
            var host = new ExtractorHost();
            host.RegisterScopedExtractor<PassthroughExtractor>();
            host.RegisterScopedExtractor<DefaultHtmlExtractor>();
            host.Initialize();

            // Fragments that must each appear in at least one extracted segment.
            var expectedFragments = new[] { "1", "2", "3", "_", "4", "html" };

            using (host.BeginServiceScope(out var extractor))
            {
                var request = new IndexingRequestDetails(
                    CultureInfo.InvariantCulture,
                    Encoding.Unicode,
                    "text/plain",
                    string.Empty,
                    string.Empty);

                var extracted = extractor.ExtractText(request, GetTestObjectStream());

                foreach (var fragment in expectedFragments)
                {
                    Assert.IsTrue(extracted.ExtractionPointDetails.Any(c => c.Segment.Contains(fragment)));
                }
            }
        }
Example #2
0
        /// <summary>
        /// Chooses one extractor for the request and delegates the extraction to it.
        /// </summary>
        /// <remarks>
        /// Only extractors whose <c>CanExtract</c> accepts the request's culture, MIME type and schema
        /// are considered. Among those, <c>details.Handler</c> is compared against each extractor's
        /// type name in decreasing order of specificity: assembly-qualified name, then full name,
        /// then short name. If nothing matches, the first accepted extractor is used.
        /// </remarks>
        /// <param name="details">Request details used both for selection and for the extraction itself.</param>
        /// <param name="reader">Stream handed unchanged to the chosen extractor.</param>
        /// <returns>Whatever the chosen extractor returns.</returns>
        /// <exception cref="System.InvalidOperationException">
        /// No extractor accepts the request's culture, MIME type and schema.
        /// </exception>
        public ExtractionResult ExtractText(IndexingRequestDetails details, Stream reader)
        {
            var available = Extractors.Where(e => e.CanExtract(details.Culture, details.MimeType, details.Schema))
                            .ToArray();

            // Without this guard the fallback below would index into an empty array and
            // fail with an IndexOutOfRangeException that says nothing about the cause.
            if (available.Length == 0)
            {
                throw new System.InvalidOperationException(
                    $"No extractor can handle MIME type '{details.MimeType}'.");
            }

            var handler = details.Handler;

            // Most specific name match first; the first accepted extractor is the last resort.
            var choice = available.FirstOrDefault(e => e.GetType().AssemblyQualifiedName == handler)
                         ?? available.FirstOrDefault(e => e.GetType().FullName == handler)
                         ?? available.FirstOrDefault(e => e.GetType().Name == handler)
                         ?? available[0];

            return choice.ExtractText(details, reader);
        }
Example #3
0
        /// <summary>
        /// Loads the stream as an XML document and produces an annotated extraction result from it.
        /// </summary>
        /// <param name="details">Request details; supplies the text encoding and is passed to the result.</param>
        /// <param name="stream">Stream containing the XML. The reader is created with leaveOpen, so the stream is not closed here.</param>
        /// <returns>The result after segment generation and annotation.</returns>
        public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
        {
            using (var xmlReader = new StreamReader(
                       stream,
                       details.Encoding,
                       detectEncodingFromByteOrderMarks: false,
                       bufferSize: 16,
                       leaveOpen: true))
            {
                var xml    = XDocument.Load(xmlReader);
                var result = new ExtractionResult(details);

                if (Extensions.Count != 0)
                {
                    // Extensions are tried in order; the first one reporting success ends the search.
                    foreach (var extension in Extensions)
                    {
                        var handled = extension.TryProcess(result, xml);
                        if (handled)
                        {
                            break;
                        }
                    }
                }
                else
                {
                    // No extensions registered: segment the whole document with the simple degrapher.
                    result.GenerateSegments(xml, SimpleDegrapher.XElementDegrapher);
                }

                result.AnnotateSegments();
                return result;
            }
        }
        /// <summary>
        /// Reads the whole stream as text and produces an annotated extraction result
        /// with the non-empty lines as input to segment generation.
        /// </summary>
        /// <param name="details">Request details; supplies the text encoding and is passed to the result.</param>
        /// <param name="stream">Stream containing the text. The reader is created with leaveOpen, so the stream is not closed here.</param>
        /// <returns>The result after segment generation and annotation.</returns>
        public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
        {
            using (var textReader = new StreamReader(
                       stream,
                       details.Encoding,
                       detectEncodingFromByteOrderMarks: false,
                       bufferSize: 16,
                       leaveOpen: true))
            {
                var text = textReader.ReadToEnd();

                // Splitting on both '\r' and '\n' yields empty pieces between "\r\n" pairs
                // and for blank lines; those are dropped.
                var nonEmptyLines = text.Split('\r', '\n')
                                        .Where(line => line.Length != 0)
                                        .ToArray();

                var result = new ExtractionResult(details);
                result.GenerateSegments(nonEmptyLines, null);
                result.AnnotateSegments();
                return result;
            }
        }
        /// <summary>
        /// Reads the whole stream as JSON text, deserializes it, and produces an annotated
        /// extraction result from the deserialized object.
        /// </summary>
        /// <param name="details">Request details; supplies the text encoding and is passed to the result.</param>
        /// <param name="stream">Stream containing the JSON. The reader is created with leaveOpen, so the stream is not closed here.</param>
        /// <returns>The result after segment generation and annotation.</returns>
        public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
        {
            using (var jsonReader = new StreamReader(
                       stream,
                       details.Encoding,
                       detectEncodingFromByteOrderMarks: false,
                       bufferSize: 16,
                       leaveOpen: true))
            {
                var json  = jsonReader.ReadToEnd();
                var graph = JsonConvert.DeserializeObject(json);

                var result = new ExtractionResult(details);
                result.GenerateSegments(graph, JsonDegrapher);
                result.AnnotateSegments();
                return result;
            }
        }
        // Runs DefaultHtmlExtractor directly on the test stream with a "text/html" request and
        // verifies that each expected fragment occurs in some extracted segment.
        public void HtmlExtractorTest()
        {
            var request = IndexingRequestDetails.Create<DefaultHtmlExtractor>(
                CultureInfo.InvariantCulture,
                Encoding.Unicode,
                "text/html",
                string.Empty);

            var extracted = new DefaultHtmlExtractor().ExtractText(request, GetTestObjectStream());

            // Fragments that must each appear in at least one extracted segment.
            foreach (var fragment in new[] { "1", "2", "3", "_", "4" })
            {
                Assert.IsTrue(extracted.ExtractionPointDetails.Any(c => c.Segment.Contains(fragment)));
            }
        }
        /// <summary>
        /// Reads the stream line by line as tab-separated text and produces an annotated
        /// extraction result with one entry per line.
        /// </summary>
        /// <param name="details">Request details; supplies the text encoding and is passed to the result.</param>
        /// <param name="stream">Stream containing the rows. The reader is created with leaveOpen, so the stream is not closed here.</param>
        /// <returns>The result after segment generation and annotation.</returns>
        public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
        {
            using (var rowReader = new StreamReader(
                       stream,
                       details.Encoding,
                       detectEncodingFromByteOrderMarks: false,
                       bufferSize: 16,
                       leaveOpen: true))
            {
                var entries = new List<string>();

                for (var line = rowReader.ReadLine(); line != null; line = rowReader.ReadLine())
                {
                    // Each row's tab-separated fields are emitted last-to-first, one per line.
                    var fieldsLastToFirst = Enumerable.Reverse(line.Split('\t'));
                    entries.Add(string.Join("\r\n", fieldsLastToFirst));
                }

                var result = new ExtractionResult(details);
                result.GenerateSegments(entries, null);
                result.AnnotateSegments();
                return result;
            }
        }
Example #8
0
 /// <summary>
 /// Builds indexing requests for at most ten "*.xml" files found in C:\testDocSource\ or any of
 /// its subdirectories.
 /// </summary>
 /// <returns>
 /// A lazily evaluated sequence; each request is created when the caller enumerates it, and each
 /// request is given a callback that opens its file for reading.
 /// </returns>
 private IEnumerable<TextIndexingRequest> GetXmlFiles()
 {
     // The directory scan itself runs immediately; only the projection below is deferred.
     var xmlPaths = Directory
                    .GetFiles("C:\\testDocSource\\", "*.xml", SearchOption.AllDirectories)
                    .Take(10);

     return from fullFileName in xmlPaths
            select new TextIndexingRequest(
                null,
                fullFileName,
                "text file",
                "",
                IndexingRequestDetails.Create<DefaultXmlExtractor>(
                    CultureInfo.InvariantCulture,
                    Encoding.ASCII,
                    "text/xml",
                    string.Empty),
                unused => File.OpenRead(fullFileName));
 }
Example #9
0
        /// <summary>
        /// Reads the whole stream as HTML, parses it, and produces an annotated extraction result.
        /// </summary>
        /// <remarks>
        /// When <c>FullDocumentCapture</c> is set, segments are generated from the parsed document's
        /// <c>TextContent</c>; otherwise they are generated from the parsed document itself.
        /// </remarks>
        /// <param name="details">Request details; supplies the text encoding and is passed to the result.</param>
        /// <param name="stream">Stream containing the HTML. The reader is created with leaveOpen, so the stream is not closed here.</param>
        /// <returns>The result after segment generation and annotation.</returns>
        public ExtractionResult ExtractText(IndexingRequestDetails details, Stream stream)
        {
            using (var htmlReader = new StreamReader(
                       stream,
                       details.Encoding,
                       detectEncodingFromByteOrderMarks: false,
                       bufferSize: 16,
                       leaveOpen: true))
            {
                var parser = new HtmlParser(Configuration.Default.WithDefaultLoader());
                var markup = htmlReader.ReadToEnd();
                var parsed = parser.Parse(markup);

                var result = new ExtractionResult(details);

                // using degrapher because AngleSharp uses recursion
                if (!FullDocumentCapture)
                {
                    result.GenerateSegments(parsed, HtmlDegrapher);
                }
                else
                {
                    result.GenerateSegments(parsed.TextContent, HtmlDegrapher);
                }

                result.AnnotateSegments();
                return result;
            }
        }
 /// <summary>
 /// Creates a request that carries the given details.
 /// </summary>
 /// <param name="details">Stored as-is in <c>Details</c>; no validation is performed.</param>
 public IndexingRequest(IndexingRequestDetails details) => Details = details;