/// <summary>
/// Writes the uploaded bytes to the path returned by <c>handler.GetFileName</c>,
/// selects a parser for that file and returns the parsed document.
/// </summary>
/// <param name="request">Request body carrying the file name, raw data and requested parsing type.</param>
/// <returns>
/// 200 with a <see cref="ParsingResult"/> on success; 400 when the request or its data is missing;
/// 500 when no parser is available for the file.
/// </returns>
/// <exception cref="ArgumentOutOfRangeException">The request carries an unknown parsing type.</exception>
public async Task<ActionResult<ParsingResult>> ProcessFile([FromBody] ParsingRequest request)
{
    // Reject an empty body up front; otherwise the dereferences below fail with a NullReferenceException.
    if (request == null || request.Data == null)
    {
        return StatusCode(400, "Request and its data must be provided");
    }

    string fullPath = handler.GetFileName(request.Name);
    using (var stream = new FileStream(fullPath, FileMode.Create))
    {
        await stream.WriteAsync(request.Data, 0, request.Data.Length).ConfigureAwait(false);
    }

    var fileInfo = new FileInfo(fullPath);
    var parser = parserFactory.ConstructParsers(fileInfo);
    if (parser is NullTextParser)
    {
        // NOTE(review): 500 is kept for backward compatibility; a 4xx code (e.g. 415) may describe this better.
        return StatusCode(500, "Can't process this type of file");
    }

    // Translate the API-level parsing type into the reader-level one.
    ParsingType type;
    switch (request.Type)
    {
        case RequestParsingType.Extract:
            type = ParsingType.Extract;
            break;
        case RequestParsingType.OCR:
            type = ParsingType.OCR;
            break;
        case RequestParsingType.Any:
            type = ParsingType.Any;
            break;
        default:
            throw new ArgumentOutOfRangeException(nameof(request), request.Type, "Unsupported parsing type");
    }

    // NOTE(review): the meaning of the literal 50 is defined by Readers.ParsingRequest — confirm and name it.
    var parseRequest = new Readers.ParsingRequest(fileInfo, type, 50);
    var result = await parser.Parse(parseRequest).ConfigureAwait(false);

    var parsingResult = new ParsingResult
    {
        Document = result.Document,
        Type = result.ProcessedAs?.ToString(),
        Name = fileInfo.Name
    };

    return Ok(parsingResult);
}
/// <summary>
/// Builds a <see cref="DocumentDefinition"/> for <paramref name="file"/>: its CRC32C checksum,
/// a label derived from its directory relative to <paramref name="repositoryPath"/>, its full path,
/// and a table of word occurrence counts extracted from the parsed text.
/// </summary>
/// <param name="repositoryPath">Root directory the label is computed relative to.</param>
/// <param name="file">File to parse; it must exist.</param>
/// <param name="token">Cancellation token checked between the processing stages and for every word.</param>
/// <returns>The populated definition, or <c>null</c> when no parser is available for the file.</returns>
/// <exception cref="OperationCanceledException">Cancellation was requested through <paramref name="token"/>.</exception>
public async Task<DocumentDefinition> ParseDocument(DirectoryInfo repositoryPath, FileInfo file, CancellationToken token)
{
    logger.Debug("ParseDocument: {0}", file);
    Guard.NotNull(() => file, file);
    Guard.NotNull(() => repositoryPath, repositoryPath);
    Guard.IsValid(() => file, file, info => info.Exists, "invalid file");

    var parser = textParserFactory.ConstructParsers(file);
    if (parser == NullTextParser.Instance)
    {
        logger.Debug("Null parser: {0}", file);
        return null;
    }

    var text = parser.Parse();
    var definition = new DocumentDefinition();
    var bytes = File.ReadAllBytes(file.FullName);
    token.ThrowIfCancellationRequested();
    definition.Crc32 = Crc32CAlgorithm.Compute(bytes);

    // Make sure the repository root ends with a separator before computing the relative directory.
    string root = repositoryPath.FullName;
    bool endsWithSeparator = string.IsNullOrEmpty(root) || root[root.Length - 1] == Path.DirectorySeparatorChar;
    string path = endsWithSeparator ? root : $"{root}{Path.DirectorySeparatorChar}";

    var directory = path.GetRelativePath(file.DirectoryName);
    if (string.IsNullOrEmpty(directory))
    {
        // No relative directory: label the document with the name of its own directory, when there is one.
        if (file.Directory != null)
        {
            definition.Labels = new[] { file.Directory.Name };
        }
    }
    else
    {
        // Label with the first directory segment under the repository root.
        definition.Labels = new[] { directory.Split(Path.DirectorySeparatorChar).First() };
    }

    // Labels may still be unassigned when the file has no directory; Select would throw on a null source.
    if (definition.Labels != null)
    {
        definition.Labels = definition.Labels.Select(item => item.CreateLetterText()).ToArray();
    }

    definition.Path = file.FullName;
    if (string.IsNullOrWhiteSpace(text))
    {
        return definition;
    }

    var result = await textSplitter.Splitter.Process(new ParseRequest(text)).ConfigureAwait(false);
    token.ThrowIfCancellationRequested();
    var review = result.GetReview(textSplitter.DataLoader);
    token.ThrowIfCancellationRequested();

    // Skip numbers, symbols, separators, conjunctions, sentence markers and stop words.
    var words = review.Items
        .Where(
            item => item.POS.WordType != WordType.Number &&
                    item.POS.WordType != WordType.SeparationSymbol &&
                    item.POS.WordType != WordType.Symbol &&
                    item.POS.WordType != WordType.Conjunction &&
                    item.POS.WordType != WordType.Sentence &&
                    !item.IsStopWord)
        .Select(item => item.Text)
        .ToArray();

    foreach (var word in words)
    {
        token.ThrowIfCancellationRequested();

        // Use the value supplied by the shared words table when the lookup succeeds; otherwise use the word itself.
        string underlyingWord;
        if (!wordsTable.TryGetAddItem(word, word, out underlyingWord))
        {
            underlyingWord = word;
        }

        var total = definition.WordsTable.GetSafe(underlyingWord);
        total++;
        definition.WordsTable[underlyingWord] = total;
    }

    return definition;
}