예제 #1
0
        /// <summary>
        /// Builds one <see cref="TextPageData"/> for every workspace result whose cached
        /// payload can be loaded from the database.
        /// </summary>
        /// <param name="db">Database queried for cached web resource data by hash.</param>
        /// <param name="results">
        /// Workspace results to convert. Results with a null/blank <c>DataHash</c>, or for which
        /// the database returns no data, are skipped.
        /// </param>
        /// <returns>The pages that were built, in the order the results were enumerated.</returns>
        private IEnumerable<TextPageData> GetPages(Database db, IEnumerable<WorkspaceResult> results)
        {
            var parser = new BracketPipeTextExtractor();
            var pages = new List<TextPageData>();

            foreach (var result in results)
            {
                if (string.IsNullOrWhiteSpace(result.DataHash))
                {
                    continue;
                }

                // NOTE(review): this blocks on an async call because the method is synchronous.
                // Making the method async would change its signature for callers, so it is left as-is.
                var data = db.GetWebResourceCacheData(new MD5Hash(result.DataHash)).GetAwaiter().GetResult();

                if (data == null)
                {
                    continue;
                }

                var fragments = new List<BracketPipeTextFragment>();
                using (var ms = new MemoryStream(data))
                {
                    parser.Parse(ms, (x) => { var y = FilterRawTextFragments(x); fragments.AddIfNotNull(y); });
                }

                // Build " {Tag} {Text}" per fragment in a single buffer. This yields exactly the
                // same string as folding with string.Format("{0} {1} {2}", acc, Tag, Text), but
                // without re-copying the accumulated text for every fragment (O(n) instead of O(n^2)).
                var text = new System.Text.StringBuilder();
                foreach (var fragment in fragments)
                {
                    text.AppendFormat(" {0} {1}", fragment.Tag, fragment.Text);
                }

                pages.Add(new TextPageData
                {
                    TextData = text.ToString(),
                    Category = MakeCategory(result)
                });
            }

            return pages;
        }
예제 #2
0
 /// <summary>
 /// Runs text extraction over the response stream when the content type is a text or XML
 /// type; other content types are ignored.
 /// </summary>
 /// <param name="dataStream">Response body to extract text from.</param>
 /// <returns>A task representing the (synchronously performed) processing.</returns>
 public override async Task ProcessResponseStream(Stream dataStream)
 {
     // Nothing to extract from content that is neither text nor XML.
     if (!ContentType.IsTextType && !ContentType.IsXmlType)
     {
         return;
     }

     // Reset the flag before the extraction callbacks start firing.
     writtenUri = false;

     var extractor = new BracketPipeTextExtractor();
     extractor.Distinct      = true;
     extractor.Granularity   = ExtractionGranularity.Raw;
     extractor.MaximumLength = int.MaxValue;
     extractor.MinimumLength = int.MinValue;
     extractor.StopWords     = false;

     extractor.Parse(dataStream, WriteExtractedText);
 }
예제 #3
0
        /// <summary>
        /// Loads the cached web resource identified by <paramref name="datahash"/> and returns
        /// the text fragments extracted from it.
        /// </summary>
        /// <param name="datahash">Hash string used to build the <c>MD5Hash</c> lookup key.</param>
        /// <param name="filter">Value assigned to the extractor's <c>Filter</c>.</param>
        /// <param name="minlen">Value assigned to the extractor's <c>MinimumLength</c>.</param>
        /// <param name="maxlen">Value assigned to the extractor's <c>MaximumLength</c>.</param>
        /// <param name="distinct">Value assigned to the extractor's <c>Distinct</c>.</param>
        /// <param name="stopWords">Value assigned to the extractor's <c>StopWords</c>.</param>
        /// <param name="granularity">Value assigned to the extractor's <c>Granularity</c>.</param>
        /// <returns>
        /// The response from <c>CreateOKResponse</c> carrying the extracted fragments, the response
        /// from <c>Create404Response</c> when no cached data exists for the hash, or the response
        /// from <c>CreateExceptionResponse</c> when any exception is thrown.
        /// </returns>
        public async Task<HttpResponseMessage> GetWebResourceCacheDataText(
            string datahash,
            BracketPipeTextExtractorFilterType filter = BracketPipeTextExtractorFilterType.Raw,
            int minlen     = int.MinValue,
            int maxlen     = int.MaxValue,
            bool distinct  = true,
            bool stopWords = true,
            ExtractionGranularity granularity = ExtractionGranularity.Raw
            )
        {
            try
            {
                using (var db = new Database())
                {
                    byte[] payload = await db.GetWebResourceCacheData(new MD5Hash(datahash));

                    // No cached entry for this hash.
                    if (payload == null)
                    {
                        return Create404Response((object)null);
                    }

                    // Configure the extractor from the caller-supplied options.
                    var extractor = new BracketPipeTextExtractor
                    {
                        Distinct      = distinct,
                        Granularity   = granularity,
                        MaximumLength = maxlen,
                        MinimumLength = minlen,
                        StopWords     = stopWords,
                        Filter        = filter
                    };

                    var fragments = new List<BracketPipeTextFragment>();
                    using (var buffer = new MemoryStream(payload))
                    {
                        extractor.Parse(buffer, fragments.Add);
                    }

                    return CreateOKResponse(fragments);
                }
            }
            catch (Exception ex)
            {
                // Any failure is reported to the caller as an exception response.
                return CreateExceptionResponse(ex);
            }
        }
예제 #4
0
        /// <summary>
        /// Creates a regex filter from a "regex:"-prefixed token, checks that the filter holds
        /// the pattern without that prefix, then runs it against the text read from the
        /// RegexSlowTest.html test file and prints each resulting tag.
        /// </summary>
        public void RegexSlowTest()
        {
            const string RegexSlowTestFilePath = @"testdata\FilterTest\RegexSlowTest.html";
            const string ExampleToken          = @"regex:(\s*\w*\s*){2,6}\scoffee\s(\s*\w*\s*){2,6}:(\s*\w*\s*){2,6}\scoffee\s(\s*\w*\s*){2,6}";
            var          f = Filter.CreateFilter(ExampleToken, 0);

            Assert.IsInstanceOfType(f, typeof(RegexFilter));

            // The type was asserted above, so a direct cast is safe and never yields null.
            var regex = (RegexFilter)f;

            // Substring(6) drops the leading "regex:" prefix of the token.
            Assert.IsTrue(regex.RegexPattern == ExampleToken.Substring(6));

            // Open read-only and dispose the stream; the original leaked the file handle.
            // Results are consumed inside the using block in case the reads are lazy.
            using (var stream = File.OpenRead(RegexSlowTestFilePath))
            {
                var fragment = BracketPipeTextExtractor.ReadAllText(stream);
                var tags     = regex.IsMatch(null, fragment.Aggregate("", (x, y) => x + " " + y), null);

                foreach (var tag in tags)
                {
                    // Print the individual tag, not the whole collection.
                    Console.WriteLine(tag);
                }
            }
        }