Esempio n. 1
0
        /// <summary>
        /// Clusters the given lines with <c>ClusteringAlgorithms.ClusterNearestNeighbours</c> and
        /// yields one <c>TextBlock</c> per resulting cluster.
        /// </summary>
        /// <param name="lines">The lines to group.</param>
        /// <param name="maxDist">The maximum distance handed to the clustering routine for every pivot/candidate pair.</param>
        /// <param name="maxDegreeOfParallelism">Forwarded unchanged to the clustering routine.</param>
        /// <returns>A lazily produced sequence of text blocks, one per cluster of line indexes.</returns>
        private static IEnumerable<TextBlock> GetLinesGroups(TextLine[] lines, double maxDist, int maxDegreeOfParallelism)
        {
            // Distance measure between two horizontal segments:
            //  1. Work out the horizontal span shared by both segments.
            //  2. If there is no shared span, report double.MaxValue.
            //  3. Otherwise take the X coordinate in the middle of the shared span and measure the
            //     Euclidean distance between the two points at that X, one at each segment's Y
            //     (the Y of each segment's first point).
            double OverlapMidpointDistance(PdfLine first, PdfLine second)
            {
                var overlapStart = Math.Max(first.Point1.X, second.Point1.X);
                var overlapWidth = Math.Min(first.Point2.X, second.Point2.X) - overlapStart;

                if (overlapWidth < 0)
                {
                    // The segments do not overlap horizontally.
                    return double.MaxValue;
                }

                var middleX = overlapStart + overlapWidth / 2;

                return Distances.Euclidean(
                    new PdfPoint(middleX, first.Point1.Y),
                    new PdfPoint(middleX, second.Point1.Y));
            }

            // The pivot is represented by the bottom edge of its bounding box and the candidate
            // by the top edge of its bounding box; no pivot or pair is filtered out.
            var clusters = ClusteringAlgorithms.ClusterNearestNeighbours(lines,
                OverlapMidpointDistance,
                (pivot, candidate) => maxDist,
                pivot => new PdfLine(pivot.BoundingBox.BottomLeft, pivot.BoundingBox.BottomRight),
                candidate => new PdfLine(candidate.BoundingBox.TopLeft, candidate.BoundingBox.TopRight),
                pivot => true, (pivot, candidate) => true,
                maxDegreeOfParallelism).ToList();

            // Each cluster is a collection of indexes into 'lines'.
            foreach (var cluster in clusters)
            {
                yield return new TextBlock(cluster.Select(index => lines[index]).ToList());
            }
        }
Esempio n. 2
0
 /// <summary>
 /// Checks that <c>Distances.Euclidean</c> applied to a 3x4 integer array yields the expected 3x3 result.
 /// </summary>
 public void TestEuclidean()
 {
     // Input: three rows of four consecutive integers.
     int[,] series = { { 0, 1, 2, 3 }, { 4, 5, 6, 7 }, { 8, 9, 10, 11 } };

     // Expected output as read back through GetData2D<float>().
     float[,] expectedDistances = { { 0, 0, 0 }, { 8, 0, 0 }, { 16, 8, 0 } };

     // Stacked usings: the result array is disposed first, then the input array.
     using (KhivaArray input = KhivaArray.Create(series))
     using (KhivaArray distances = Distances.Euclidean(input))
     {
         var actualDistances = distances.GetData2D<float>();
         Assert.AreEqual(expectedDistances, actualDistances);
     }
 }
Esempio n. 3
0
        // .NET Framework 3.5 project that consumes the library, which targets .NET Standard 2.0 - This will NOT work
        /// <summary>
        /// Prints the Euclidean distance between two fixed four-element vectors, then waits for a key press.
        /// </summary>
        /// <param name="args">Command-line arguments; not used.</param>
        static void Main(string[] args)
        {
            var p = new List<double> { 2.5, 2.7, 3.5, 7.4 };
            var q = new List<double> { 27.5, 12.7, 41.5, 35.4 };

            var distance = Distances.Euclidean(p, q);

            Console.WriteLine($"Euclidean distance between p and q is {distance}");

            // Keep the console window open until the user presses a key.
            Console.ReadKey();
        }
Esempio n. 4
0
        /// <summary>
        /// Estimates the within-line and between-line spacing of a set of words.
        /// <para>This is the Docstrum algorithm's 1st step.</para>
        /// </summary>
        /// <param name="words">The words to analyse.</param>
        /// <param name="wlBounds">Angle bounds a neighbour must fall in to count as being on the same line.</param>
        /// <param name="wlBinSize">Bin size passed to <c>GetPeakAverageDistance</c> for the within-line distances.</param>
        /// <param name="blBounds">Angle bounds a neighbour must fall in to count as being on a separate line.</param>
        /// <param name="blBinSize">Bin size passed to <c>GetPeakAverageDistance</c> for the between-line distances.</param>
        /// <param name="maxDegreeOfParallelism">Assigned to <see cref="ParallelOptions.MaxDegreeOfParallelism"/>.
        /// <para>A positive value limits the number of concurrent operations to that value.
        /// If it is -1, there is no limit on the number of concurrently running operations.</para></param>
        /// <param name="withinLineDistance">The estimated within-line distance, or <see cref="double.NaN"/> when
        /// <c>GetPeakAverageDistance</c> returned no value.</param>
        /// <param name="betweenLineDistance">The estimated between-line distance, or <see cref="double.NaN"/> when
        /// <c>GetPeakAverageDistance</c> returned no value.</param>
        /// <returns>True only when both estimates were produced; false if either one is missing.</returns>
        public static bool GetSpacingEstimation(IReadOnlyList<Word> words,
                                                AngleBounds wlBounds, int wlBinSize,
                                                AngleBounds blBounds, int blBinSize,
                                                int maxDegreeOfParallelism,
                                                out double withinLineDistance, out double betweenLineDistance)
        {
            var options = new ParallelOptions { MaxDegreeOfParallelism = maxDegreeOfParallelism };

            // Samples are appended from several threads at once, hence the concurrent collections.
            var withinLineSamples = new ConcurrentBag<double>();
            var betweenLineSamples = new ConcurrentBag<double>();

            // Spatial index over the bottom-left corner of every word.
            var bottomLeftTree = new KdTree<Word>(words, w => w.BoundingBox.BottomLeft);

            Parallel.For(0, words.Count, options, index =>
            {
                var current = words[index];

                // Within-line: query the 2 nearest neighbours, measuring from the current word's
                // bottom-right corner to the indexed bottom-left corners.
                foreach (var hit in bottomLeftTree.FindNearestNeighbours(current, 2, w => w.BoundingBox.BottomRight, Distances.Euclidean))
                {
                    var neighbour = hit.Item1;

                    if (!wlBounds.Contains(AngleWL(current, neighbour)))
                    {
                        continue; // outside the within-line angle bounds
                    }

                    withinLineSamples.Add(Distances.Euclidean(current.BoundingBox.BottomRight, neighbour.BoundingBox.BottomLeft));
                }

                // Between-line: query the 2 nearest neighbours, measuring from the current word's
                // top-left corner to the indexed bottom-left corners.
                foreach (var hit in bottomLeftTree.FindNearestNeighbours(current, 2, w => w.BoundingBox.TopLeft, Distances.Euclidean))
                {
                    var neighbour = hit.Item1;
                    var angle = AngleBL(current, neighbour);

                    if (!blBounds.Contains(angle))
                    {
                        continue; // outside the between-line angle bounds
                    }

                    double centroidDistance = Distances.Euclidean(current.BoundingBox.Centroid, neighbour.BoundingBox.Centroid);

                    // Angles above 90 degrees are shifted down by 180 before projecting.
                    if (angle > 90)
                    {
                        angle -= 180;
                    }

                    // Project the centroid-to-centroid distance using the angle, then remove
                    // half the height of each word to get the gap between them.
                    var projected = Math.Abs(centroidDistance * Math.Cos((90 - angle) * Math.PI / 180));
                    var gap = projected - current.BoundingBox.Height / 2.0 - neighbour.BoundingBox.Height / 2.0;

                    // The subtraction can push the gap below zero (e.g. overlapping words);
                    // such samples are discarded.
                    if (gap >= 0)
                    {
                        betweenLineSamples.Add(gap);
                    }
                }
            });

            // Reduce each sample set to a single estimate; a null result means no estimate.
            double? withinLinePeak = GetPeakAverageDistance(withinLineSamples, wlBinSize);
            double? betweenLinePeak = GetPeakAverageDistance(betweenLineSamples, blBinSize);

            withinLineDistance = withinLinePeak ?? double.NaN;
            betweenLineDistance = betweenLinePeak ?? double.NaN;

            return withinLinePeak.HasValue && betweenLinePeak.HasValue;
        }
Esempio n. 5
0
        /// <summary>
        /// Groups cells whose corners lie close to each other and returns, for every group that is
        /// large enough, the rectangle enclosing all cells of that group.
        /// </summary>
        /// <param name="cells">The candidate cells.</param>
        /// <returns>One enclosing rectangle per group holding at least REQUIRED_CELLS_FOR_TABLE cells.</returns>
        private List<TableRectangle> getTableAreasFromCells(List<TableRectangle> cells)
        {
            // True when some corner of 'candidate' is closer than CELL_CORNER_DISTANCE_MAXIMUM
            // to some corner of 'existing'.
            bool hasCloseCorner(TableRectangle existing, TableRectangle candidate)
            {
                PdfPoint[] existingCorners = existing.Points;
                PdfPoint[] candidateCorners = candidate.Points;

                foreach (PdfPoint candidateCorner in candidateCorners)
                {
                    foreach (PdfPoint existingCorner in existingCorners)
                    {
                        if (Distances.Euclidean(candidateCorner, existingCorner) < CELL_CORNER_DISTANCE_MAXIMUM)
                        {
                            return true;
                        }
                    }
                }

                return false;
            }

            // True when 'candidate' has a close corner with at least one cell already in 'members'.
            bool touchesGroup(List<TableRectangle> members, TableRectangle candidate)
            {
                foreach (TableRectangle existing in members)
                {
                    if (hasCloseCorner(existing, candidate))
                    {
                        return true;
                    }
                }

                return false;
            }

            var clusters = new List<List<TableRectangle>>();

            foreach (TableRectangle current in cells)
            {
                bool placed = false;

                // A cell joins the first matching group only; later groups are not examined.
                foreach (List<TableRectangle> cluster in clusters)
                {
                    if (touchesGroup(cluster, current))
                    {
                        cluster.Add(current);
                        placed = true;
                        break;
                    }
                }

                // No existing group is close enough: the cell starts a group of its own.
                if (!placed)
                {
                    clusters.Add(new List<TableRectangle> { current });
                }
            }

            // Build one table area per sufficiently large group.
            var tableAreas = new List<TableRectangle>();

            foreach (List<TableRectangle> cluster in clusters)
            {
                // Groups with fewer than REQUIRED_CELLS_FOR_TABLE cells are not treated as tables.
                if (cluster.Count < REQUIRED_CELLS_FOR_TABLE)
                {
                    continue;
                }

                // Enclosing extents: largest Top and Right, smallest Bottom and Left.
                double top = double.MinValue;
                double left = double.MaxValue;
                double bottom = double.MaxValue;
                double right = double.MinValue;

                foreach (TableRectangle member in cluster)
                {
                    top = member.Top > top ? member.Top : top;
                    left = member.Left < left ? member.Left : left;
                    bottom = member.Bottom < bottom ? member.Bottom : bottom;
                    right = member.Right > right ? member.Right : right;
                }

                tableAreas.Add(new TableRectangle(new PdfRectangle(left, bottom, right, top)));
            }

            return tableAreas;
        }