Beispiel #1
0
        /// <summary>
        /// Writes the uploaded bytes to the path supplied by <c>handler.GetFileName</c>, runs the parser
        /// selected by <c>parserFactory</c> over that file and returns the parsed document.
        /// </summary>
        /// <param name="request">Request body carrying the file name, the raw file bytes and the requested parsing type.</param>
        /// <returns>
        /// 200 with the parsing result on success; 400 when the request or its data is missing;
        /// 500 when the factory yields a <c>NullTextParser</c> for the file.
        /// </returns>
        /// <exception cref="ArgumentOutOfRangeException">
        /// The request type is not one of Extract, OCR or Any.
        /// </exception>
        public async Task<ActionResult<ParsingResult>> ProcessFile([FromBody] ParsingRequest request)
        {
            // Reject incomplete payloads up front instead of failing later with a NullReferenceException.
            if (request == null || request.Data == null)
            {
                return StatusCode(400, "Request data is missing");
            }

            string fullPath = handler.GetFileName(request.Name);

            // FileMode.Create overwrites any existing file at the same path.
            using (var stream = new FileStream(fullPath, FileMode.Create))
            {
                await stream.WriteAsync(request.Data, 0, request.Data.Length).ConfigureAwait(false);
            }

            var fileInfo = new FileInfo(fullPath);
            var parser = parserFactory.ConstructParsers(fileInfo);

            if (parser is NullTextParser)
            {
                return StatusCode(500, "Can't process this type of file");
            }

            // Translate the API-level enum into the reader-level enum.
            ParsingType type;

            switch (request.Type)
            {
                case RequestParsingType.Extract:
                    type = ParsingType.Extract;
                    break;

                case RequestParsingType.OCR:
                    type = ParsingType.OCR;
                    break;

                case RequestParsingType.Any:
                    type = ParsingType.Any;
                    break;

                default:
                    throw new ArgumentOutOfRangeException(nameof(request), request.Type, "Unsupported parsing type");
            }

            // NOTE(review): the meaning of the literal 50 is not visible here - confirm against Readers.ParsingRequest.
            var parserRequest = new Readers.ParsingRequest(fileInfo, type, 50);
            var result = await parser.Parse(parserRequest).ConfigureAwait(false);

            var parsingResult = new ParsingResult
            {
                Document = result.Document,
                Type = result.ProcessedAs?.ToString(),
                Name = fileInfo.Name
            };

            return Ok(parsingResult);
        }
Beispiel #2
0
        /// <summary>
        /// Builds a <see cref="DocumentDefinition"/> for a file: CRC32C of its bytes, a label derived from its
        /// location relative to the repository directory, and a table of word occurrence counts taken from the parsed text.
        /// </summary>
        /// <param name="repositoryPath">Root directory against which the file's directory is made relative.</param>
        /// <param name="file">File to parse; must exist.</param>
        /// <param name="token">Cancellation token checked between the processing stages and for every counted word.</param>
        /// <returns>
        /// The populated definition, or <c>null</c> when the parser factory returns <c>NullTextParser.Instance</c> for the file.
        /// </returns>
        /// <exception cref="OperationCanceledException">Cancellation was requested via <paramref name="token"/>.</exception>
        public async Task<DocumentDefinition> ParseDocument(DirectoryInfo repositoryPath, FileInfo file, CancellationToken token)
        {
            logger.Debug("ParseDocument: {0}", file);
            Guard.NotNull(() => file, file);
            Guard.NotNull(() => repositoryPath, repositoryPath);
            Guard.IsValid(() => file, file, info => info.Exists, "invalid file");

            var parser = textParserFactory.ConstructParsers(file);

            if (parser == NullTextParser.Instance)
            {
                logger.Debug("Null parser: {0}", file);
                return null;
            }

            var text = parser.Parse();
            DocumentDefinition definition = new DocumentDefinition();
            var bytes = File.ReadAllBytes(file.FullName);

            token.ThrowIfCancellationRequested();
            definition.Crc32 = Crc32CAlgorithm.Compute(bytes);

            // Make sure the repository path ends with a directory separator before computing the relative path.
            string path = string.IsNullOrEmpty(repositoryPath.FullName) || repositoryPath.FullName[repositoryPath.FullName.Length - 1] == Path.DirectorySeparatorChar
                              ? repositoryPath.FullName
                              : $"{repositoryPath.FullName}{Path.DirectorySeparatorChar}";
            var directory = path.GetRelativePath(file.DirectoryName);

            // A null relative path is treated like an empty one so that Split below is never called on null.
            if (string.IsNullOrEmpty(directory))
            {
                // No relative segment available: label with the file's own directory name.
                if (file.Directory != null)
                {
                    definition.Labels = new[] { file.Directory.Name };
                }
            }
            else
            {
                // Label with the first segment of the relative path.
                definition.Labels = new[] { directory.Split(Path.DirectorySeparatorChar).First() };
            }

            // Neither branch assigns labels when the file has no parent directory; fall back to an empty
            // set so the projection below cannot dereference null.
            if (definition.Labels == null)
            {
                definition.Labels = new string[0];
            }

            definition.Labels = definition.Labels.Select(item => item.CreateLetterText()).ToArray();
            definition.Path = file.FullName;

            if (!string.IsNullOrWhiteSpace(text))
            {
                var result = await textSplitter.Splitter.Process(new ParseRequest(text)).ConfigureAwait(false);

                token.ThrowIfCancellationRequested();
                var review = result.GetReview(textSplitter.DataLoader);
                token.ThrowIfCancellationRequested();

                // Keep only items that are not numbers, symbols, separators, conjunctions, sentences or stop words.
                var words = review.Items.Where(
                                item =>
                                item.POS.WordType != WordType.Number &&
                                item.POS.WordType != WordType.SeparationSymbol &&
                                item.POS.WordType != WordType.Symbol &&
                                item.POS.WordType != WordType.Conjunction &&
                                item.POS.WordType != WordType.Sentence &&
                                !item.IsStopWord)
                            .Select(item => item.Text)
                            .ToArray();

                foreach (var word in words)
                {
                    token.ThrowIfCancellationRequested();

                    // NOTE(review): presumably resolves the word to a shared instance held in wordsTable - confirm
                    // against TryGetAddItem. When the call reports false the word itself is used as the key.
                    string underlyingWord;
                    if (!wordsTable.TryGetAddItem(word, word, out underlyingWord))
                    {
                        underlyingWord = word;
                    }

                    // Increment the occurrence count for this word.
                    var total = definition.WordsTable.GetSafe(underlyingWord);
                    total++;
                    definition.WordsTable[underlyingWord] = total;
                }
            }

            return definition;
        }