/// <summary>
/// Builds one <see cref="TextPageData"/> for every workspace result whose cached
/// web resource data can be loaded, parsing that data into text fragments.
/// </summary>
/// <param name="db">Database queried for the cached resource bytes, keyed by the MD5 hash of each result.</param>
/// <param name="results">
/// Workspace results to convert. Results with a blank <c>DataHash</c>, or for which
/// the database returns no data, are skipped.
/// </param>
/// <returns>The pages that were built, in the same order as <paramref name="results"/>.</returns>
private IEnumerable<TextPageData> GetPages(Database db, IEnumerable<WorkspaceResult> results)
{
    var parser = new BracketPipeTextExtractor();
    var pages = new List<TextPageData>();

    foreach (var result in results)
    {
        if (string.IsNullOrWhiteSpace(result.DataHash))
        {
            continue;
        }

        // This method is synchronous, so the asynchronous cache lookup is blocked on here.
        var data = db.GetWebResourceCacheData(new MD5Hash(result.DataHash)).GetAwaiter().GetResult();
        if (data == null)
        {
            continue;
        }

        var fragments = new List<BracketPipeTextFragment>();
        using (var ms = new MemoryStream(data))
        {
            parser.Parse(ms, x =>
            {
                // Only fragments that survive filtering are kept (AddIfNotNull drops nulls).
                var filtered = FilterRawTextFragments(x);
                fragments.AddIfNotNull(filtered);
            });
        }

        // Each fragment contributes " {Tag} {Text}". Concatenating the pieces in one pass
        // produces the same string as folding with string.Format over an accumulator,
        // without re-copying the accumulated text for every fragment.
        var pieces = fragments.Select(f => string.Format(" {0} {1}", f.Tag, f.Text));

        pages.Add(new TextPageData
        {
            TextData = string.Concat(pieces),
            Category = MakeCategory(result)
        });
    }

    return pages;
}
/// <summary>
/// Runs text extraction over the response stream when the content type is a
/// text or XML type; responses of any other content type are not read.
/// </summary>
/// <param name="dataStream">Stream containing the response body to parse.</param>
/// <returns>A task representing the processing of the stream.</returns>
public override async Task ProcessResponseStream(Stream dataStream)
{
    // Guard: only text and XML content is parsed.
    if (!ContentType.IsTextType && !ContentType.IsXmlType)
    {
        return;
    }

    // Reset the flag before extraction starts.
    writtenUri = false;

    // Nothing is awaited in this method; extracted text is handed to WriteExtractedText.
    var extractor = new BracketPipeTextExtractor
    {
        Distinct = true,
        Granularity = ExtractionGranularity.Raw,
        MaximumLength = int.MaxValue,
        MinimumLength = int.MinValue,
        StopWords = false
    };

    extractor.Parse(dataStream, WriteExtractedText);
}
/// <summary>
/// Loads cached web resource data by hash and returns the text fragments
/// extracted from it.
/// </summary>
/// <param name="datahash">Hash string identifying the cached resource; wrapped in an <c>MD5Hash</c> for the lookup.</param>
/// <param name="filter">Value assigned to the extractor's <c>Filter</c> property.</param>
/// <param name="minlen">Value assigned to the extractor's <c>MinimumLength</c> property.</param>
/// <param name="maxlen">Value assigned to the extractor's <c>MaximumLength</c> property.</param>
/// <param name="distinct">Value assigned to the extractor's <c>Distinct</c> property.</param>
/// <param name="stopWords">Value assigned to the extractor's <c>StopWords</c> property.</param>
/// <param name="granularity">Value assigned to the extractor's <c>Granularity</c> property.</param>
/// <returns>
/// An OK response carrying the extracted fragments, a 404 response when no cached
/// data exists for the hash, or an exception response when anything throws.
/// </returns>
public async Task<HttpResponseMessage> GetWebResourceCacheDataText(
    string datahash,
    BracketPipeTextExtractorFilterType filter = BracketPipeTextExtractorFilterType.Raw,
    int minlen = int.MinValue,
    int maxlen = int.MaxValue,
    bool distinct = true,
    bool stopWords = true,
    ExtractionGranularity granularity = ExtractionGranularity.Raw)
{
    try
    {
        using (var db = new Database())
        {
            byte[] cached = await db.GetWebResourceCacheData(new MD5Hash(datahash));
            if (cached == null)
            {
                return Create404Response((object)null);
            }

            // Configure the extractor from the caller-supplied options.
            var extractor = new BracketPipeTextExtractor
            {
                Distinct = distinct,
                Granularity = granularity,
                MaximumLength = maxlen,
                MinimumLength = minlen,
                StopWords = stopWords,
                Filter = filter
            };

            var fragments = new List<BracketPipeTextFragment>();
            using (var buffer = new MemoryStream(cached))
            {
                extractor.Parse(buffer, fragments.Add);
            }

            return CreateOKResponse(fragments);
        }
    }
    catch (Exception ex)
    {
        // Any failure is reported to the caller as an exception response.
        return CreateExceptionResponse(ex);
    }
}
/// <summary>
/// Creates a regex filter from a token with nested quantifiers, runs it against
/// the text read from the RegexSlowTest.html test file and prints every tag
/// the match yields.
/// </summary>
public void RegexSlowTest()
{
    const string RegexSlowTestFilePath = @"testdata\FilterTest\RegexSlowTest.html";
    const string ExampleToken = @"regex:(\s*\w*\s*){2,6}\scoffee\s(\s*\w*\s*){2,6}:(\s*\w*\s*){2,6}\scoffee\s(\s*\w*\s*){2,6}";

    var f = Filter.CreateFilter(ExampleToken, 0);
    Assert.IsInstanceOfType(f, typeof(RegexFilter));

    // The assertion above guarantees the type, so a direct cast is used.
    var regex = (RegexFilter)f;

    // Substring(6) drops the leading "regex:" prefix of the token.
    Assert.IsTrue(regex.RegexPattern == ExampleToken.Substring(6));

    string text;
    // Open read-only and dispose the stream so the file handle is not leaked.
    // The text is aggregated inside the scope in case ReadAllText reads lazily.
    using (var stream = File.OpenRead(RegexSlowTestFilePath))
    {
        var fragment = BracketPipeTextExtractor.ReadAllText(stream);
        text = fragment.Aggregate("", (x, y) => x + " " + y);
    }

    var tags = regex.IsMatch(null, text, null);
    foreach (var tag in tags)
    {
        // Print the current tag, not the whole collection.
        Console.WriteLine(tag);
    }
}