Пример #1
0
        /// <summary>
        /// Groups the page's letters into <see cref="Word"/>s, reading every tuning value from <paramref name="options"/>.
        /// <para>When <c>GroupByOrientation</c> is set, the letters are partitioned by <see cref="TextOrientation"/> and each
        /// partition is processed separately: the four axis aligned orientations use <c>DistanceMeasureAA</c>, while
        /// <c>TextOrientation.Other</c> uses <c>DistanceMeasure</c>. Otherwise all letters are processed in a single pass
        /// with <c>DistanceMeasure</c>.</para>
        /// </summary>
        /// <param name="letters">The page's letters to group into <see cref="Word"/>s.</param>
        /// <param name="options">The options to use; must be a <see cref="NearestNeighbourWordExtractorOptions"/>.</param>
        /// <returns>The <see cref="Word"/>s generated by the nearest neighbour method, or an empty collection
        /// when <paramref name="letters"/> is <c>null</c> or empty.</returns>
        /// <exception cref="ArgumentException">Thrown when <paramref name="options"/> is not a
        /// <see cref="NearestNeighbourWordExtractorOptions"/> (this includes <c>null</c>).</exception>
        public IEnumerable<Word> GetWords(IReadOnlyList<Letter> letters, DlaOptions options)
        {
            // The options type is validated first, so wrong options throw even when there are no letters.
            if (!(options is NearestNeighbourWordExtractorOptions nnOptions))
            {
                throw new ArgumentException("Options provided must be of type " + nameof(NearestNeighbourWordExtractorOptions) + ".", nameof(options));
            }

            if (letters == null || letters.Count == 0)
            {
                return EmptyArray<Word>.Instance;
            }

            if (!nnOptions.GroupByOrientation)
            {
                // Single pass over every letter, whatever its orientation.
                return GetWords(
                    letters,
                    nnOptions.MaximumDistance,
                    nnOptions.DistanceMeasure,
                    nnOptions.FilterPivot,
                    nnOptions.Filter,
                    nnOptions.MaxDegreeOfParallelism);
            }

            // Axis aligned: the horizontal pass seeds the result list that the other passes append to.
            List<Word> grouped = GetWords(
                letters.Where(l => l.TextOrientation == TextOrientation.Horizontal).ToList(),
                nnOptions.MaximumDistance,
                nnOptions.DistanceMeasureAA,
                nnOptions.FilterPivot,
                nnOptions.Filter,
                nnOptions.MaxDegreeOfParallelism);

            // Remaining axis aligned orientations, appended in this exact order.
            var remainingAxisAligned = new[]
            {
                TextOrientation.Rotate270,
                TextOrientation.Rotate180,
                TextOrientation.Rotate90
            };

            foreach (TextOrientation orientation in remainingAxisAligned)
            {
                List<Letter> subset = letters.Where(l => l.TextOrientation == orientation).ToList();
                grouped.AddRange(GetWords(
                    subset,
                    nnOptions.MaximumDistance,
                    nnOptions.DistanceMeasureAA,
                    nnOptions.FilterPivot,
                    nnOptions.Filter,
                    nnOptions.MaxDegreeOfParallelism));
            }

            // Not axis aligned: uses the general distance measure instead of the axis aligned one.
            List<Letter> notAxisAligned = letters.Where(l => l.TextOrientation == TextOrientation.Other).ToList();
            grouped.AddRange(GetWords(
                notAxisAligned,
                nnOptions.MaximumDistance,
                nnOptions.DistanceMeasure,
                nnOptions.FilterPivot,
                nnOptions.Filter,
                nnOptions.MaxDegreeOfParallelism));

            return grouped;
        }
Пример #2
0
        /// <summary>
        /// Get the text blocks using options.
        /// <para>All the words are placed in a single <see cref="TextBlock"/>: the lines are obtained from an
        /// <see cref="XYLeaf"/> built over the words, using the options' <c>WordSeparator</c>, and the block is
        /// created with the options' <c>LineSeparator</c>.</para>
        /// </summary>
        /// <param name="words">The page's words to generate text blocks for.</param>
        /// <param name="options">The options to use; must be a <see cref="DefaultPageSegmenterOptions"/>.</param>
        /// <returns>A list holding the single <see cref="TextBlock"/> generated by the default method, or an empty
        /// collection when <paramref name="words"/> is <c>null</c> or empty.</returns>
        /// <exception cref="ArgumentException">Thrown when <paramref name="options"/> is not a
        /// <see cref="DefaultPageSegmenterOptions"/> (this includes <c>null</c>).</exception>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
        {
            // The options type is validated first, so wrong options throw even when there are no words.
            if (!(options is DefaultPageSegmenterOptions dOptions))
            {
                throw new ArgumentException("Options provided must be of type " + nameof(DefaultPageSegmenterOptions) + ".", nameof(options));
            }

            // Materialise the sequence once: checking for emptiness and then handing the same lazy
            // sequence on would evaluate a deferred query twice. Already materialised collections
            // are reused as-is to avoid a needless copy.
            IReadOnlyCollection<Word> pageWords = (words as IReadOnlyCollection<Word>) ?? words?.ToList();
            if (pageWords == null || pageWords.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // Keep the static type IEnumerable<Word> so the same XYLeaf constructor is selected as before.
            IEnumerable<Word> leafWords = pageWords;

            return new List<TextBlock>()
            {
                new TextBlock(new XYLeaf(leafWords).GetLines(dOptions.WordSeparator), dOptions.LineSeparator)
            };
        }
Пример #3
0
        /// <summary>
        /// Get the blocks using options values.
        /// <para>Forwards to the parameterised <c>GetBlocks</c> overload with the options' <c>MinimumWidth</c>,
        /// <c>DominantFontWidthFunc</c>, <c>DominantFontHeightFunc</c>, <c>WordSeparator</c> and <c>LineSeparator</c>.</para>
        /// </summary>
        /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
        /// <param name="options">The options to use; must be a <see cref="RecursiveXYCutOptions"/>.</param>
        /// <returns>The <see cref="TextBlock"/>s generated by the Recursive X-Y cut method, or an empty
        /// collection when <paramref name="words"/> is <c>null</c> or empty.</returns>
        /// <exception cref="ArgumentException">Thrown when <paramref name="options"/> is not a
        /// <see cref="RecursiveXYCutOptions"/> (this includes <c>null</c>).</exception>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
        {
            // The options type is validated first, so wrong options throw even when there are no words.
            if (!(options is RecursiveXYCutOptions ryxcOptions))
            {
                throw new ArgumentException("Options provided must be of type " + nameof(RecursiveXYCutOptions) + ".", nameof(options));
            }

            // Materialise the sequence once: checking for emptiness and then forwarding the same lazy
            // sequence would evaluate a deferred query twice. Already materialised collections
            // are reused as-is to avoid a needless copy.
            IReadOnlyCollection<Word> pageWords = (words as IReadOnlyCollection<Word>) ?? words?.ToList();
            if (pageWords == null || pageWords.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            // Keep the static type IEnumerable<Word> so the same GetBlocks overload is selected as before.
            IEnumerable<Word> wordsToSegment = pageWords;

            return GetBlocks(
                wordsToSegment,
                ryxcOptions.MinimumWidth,
                ryxcOptions.DominantFontWidthFunc,
                ryxcOptions.DominantFontHeightFunc,
                ryxcOptions.WordSeparator,
                ryxcOptions.LineSeparator);
        }
Пример #4
0
        /// <summary>
        /// Get the blocks using options values.
        /// <para>Copies the words into a list and forwards it to the parameterised <c>GetBlocks</c> overload together
        /// with the within-line, between-line, angular difference, epsilon, separator and parallelism settings
        /// read from the options.</para>
        /// </summary>
        /// <param name="words">The page's words to segment into <see cref="TextBlock"/>s.</param>
        /// <param name="options">The options to use; must be a <see cref="DocstrumBoundingBoxesOptions"/>.</param>
        /// <returns>The <see cref="TextBlock"/>s generated by the document spectrum method, or an empty
        /// collection when <paramref name="words"/> is <c>null</c> or empty.</returns>
        /// <exception cref="ArgumentException">Thrown when <paramref name="options"/> is not a
        /// <see cref="DocstrumBoundingBoxesOptions"/> (this includes <c>null</c>).</exception>
        public IReadOnlyList<TextBlock> GetBlocks(IEnumerable<Word> words, DlaOptions options)
        {
            // The options type is validated first, so wrong options throw even when there are no words.
            if (!(options is DocstrumBoundingBoxesOptions dbbOptions))
            {
                throw new ArgumentException("Options provided must be of type " + nameof(DocstrumBoundingBoxesOptions) + ".", nameof(options));
            }

            // The list copy is needed for the call below anyway, so take it before the emptiness check:
            // the source sequence is then enumerated exactly once instead of once for Any() and again for ToList().
            List<Word> wordList = words?.ToList();
            if (wordList == null || wordList.Count == 0)
            {
                return EmptyArray<TextBlock>.Instance;
            }

            return GetBlocks(
                wordList,
                dbbOptions.WithinLineBounds, dbbOptions.WithinLineMultiplier, dbbOptions.WithinLineBinSize,
                dbbOptions.BetweenLineBounds, dbbOptions.BetweenLineMultiplier, dbbOptions.BetweenLineBinSize,
                dbbOptions.AngularDifferenceBounds,
                dbbOptions.Epsilon,
                dbbOptions.WordSeparator, dbbOptions.LineSeparator,
                dbbOptions.MaxDegreeOfParallelism);
        }