示例#1
0
        public double Similarity(ExelRow exelRow, int maxColumnsCount = 0)
        {
            double result     = 0.0;
            bool   wasError   = false;
            var    logSession = Helpers.Old.Log.SessionStart("ExelRow.Similarity()", true);

            try
            {
                if (maxColumnsCount == 0)
                {
                    maxColumnsCount = int.MaxValue;
                }

                //if (exelRow.Cells.Count == this.Cells.Count)
                //{

                int maxValue = Math.Min(Math.Min(exelRow.Cells.Count, this.Cells.Count), maxColumnsCount);

                if (maxValue > 0)
                {
                    for (int i = 0; i < maxValue; i++)
                    {
                        result += exelRow.Cells[i].IsMerged == this.Cells[i].IsMerged ? 0.09 : 0;
                        result += string.IsNullOrEmpty(exelRow.Cells[i].Value) == string.IsNullOrEmpty(this.Cells[i].Value) ? 0.44 : 0.0;
                        result += Math.Abs(exelRow.Cells[i].UniqueWeight - this.Cells[i].UniqueWeight) < 0.07 ? 0.15 : 0.0;
                        result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.ForegroundColor, this.Cells[i].CellStyle.ForegroundColor) ? 0.16 : 0.0;
                        result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.BackgroundColor, this.Cells[i].CellStyle.BackgroundColor) ? 0.16 : 0.0;
                    }
                }
                result = result / maxValue;


                //}

                if (maxColumnsCount == int.MaxValue)
                {
                    result  = result * 0.55;
                    result += ((double)Math.Min(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count()) / (double)Math.Max(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count())) * 0.4;

                    //result += (this.NotEmptyCells.Count() == exelRow.NotEmptyCells.Count()) ? 0.4 : 0;
                    result += (this.UniqueNotEmptyCells.Count() == exelRow.UniqueNotEmptyCells.Count()) ? 0.05 : 0;
                }
            }
            catch (Exception ex)
            {
                wasError = true;
                Helpers.Old.Log.Add(logSession, ex);
                throw ex;
            }
            finally
            {
                Helpers.Old.Log.SessionEnd(logSession, wasError);
            }
            return(result);
        }
示例#2
0
        public double Similarity(ExelRow exelRow, int maxColumnsCount = 0)
        {
            double result = 0.0;
            bool wasError = false;
            var logSession = Helpers.Old.Log.SessionStart("ExelRow.Similarity()", true);
            try
            {
                if (maxColumnsCount == 0)
                    maxColumnsCount = int.MaxValue;

                //if (exelRow.Cells.Count == this.Cells.Count)
                //{

                int maxValue = Math.Min(Math.Min(exelRow.Cells.Count, this.Cells.Count), maxColumnsCount);

                if (maxValue > 0)
                    for (int i = 0; i < maxValue; i++)
                    {
                        result += exelRow.Cells[i].IsMerged == this.Cells[i].IsMerged ? 0.09 : 0;
                        result += string.IsNullOrEmpty(exelRow.Cells[i].Value) == string.IsNullOrEmpty(this.Cells[i].Value) ? 0.44 : 0.0;
                        result += Math.Abs(exelRow.Cells[i].UniqueWeight - this.Cells[i].UniqueWeight) < 0.07 ? 0.15 : 0.0;
                        result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.ForegroundColor,this.Cells[i].CellStyle.ForegroundColor) ? 0.16 : 0.0;
                        result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.BackgroundColor, this.Cells[i].CellStyle.BackgroundColor) ? 0.16 : 0.0;
                    }
                result = result / maxValue;

                //}

                if (maxColumnsCount == int.MaxValue)
                {
                    result = result * 0.55;
                    result += ((double)Math.Min(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count()) / (double)Math.Max(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count())) * 0.4;

                    //result += (this.NotEmptyCells.Count() == exelRow.NotEmptyCells.Count()) ? 0.4 : 0;
                    result += (this.UniqueNotEmptyCells.Count() == exelRow.UniqueNotEmptyCells.Count()) ? 0.05 : 0;
                }
            }
            catch(Exception ex)
            {
                wasError = true;
                Helpers.Old.Log.Add(logSession, ex);
                throw ex;
            }
            finally
            {
                Helpers.Old.Log.SessionEnd(logSession, wasError);
            }
            return result;
        }
示例#3
0
        public static ExelRow[] LoadRows(Worksheet sheet, int count = 0, Action <int> progressReport = null, bool deleteEmptyRows = true)
        {
            var result         = new List <ExelRow>();
            var totalRowsCount = 0;
            var loaded         = 0;

            var pp = new Helpers.PercentageProgress();

            if (progressReport != null)
            {
                pp.Change += (s, e) => { progressReport((int)e.Value); }
            }
            ;

            var pp0 = pp.GetChild(weight: 9);
            var pp1 = pp.GetChild(weight: 1);

            try
            {
                totalRowsCount = Math.Max(sheet.Cells.MaxRow + 1, sheet.Cells.Rows.Count);
            }
            catch
            {
                totalRowsCount = sheet.Cells.Rows.Count;
            }


            if (count <= 0)
            {
                count = totalRowsCount;
            }

            #region Select max column index for sheet
            int maxColumnIndex = sheet.Cells.Rows.Cast <Row>().Take(count).Select(c =>
            {
                if (c.LastCell != null)
                {
                    for (int i = c.LastCell.Column; i >= 0; i--)
                    {
                        var cell = c.GetCellOrNull(i);
                        if (!string.IsNullOrWhiteSpace(
                                cell == null || cell.Value == null
                                    ? null
                                    : cell.Value.ToString().Trim()
                                )
                            )
                        {
                            return(cell.IsMerged ? cell.GetMergedRange().FirstColumn + cell.GetMergedRange().ColumnCount - 1 : i);
                        }
                    }
                }
                return(0);
            }
                                                                                  )
                                 .Union(new int[] { 0 })
                                 .Max();
            #endregion
            #region Read rows

            if (sheet.Cells.Count > 0)
            {
                var lockObj  = new Object();
                var lockCell = new Object();
                sheet.Cells.MultiThreadReading = true;

                var items =
                    sheet.Cells.Rows.Cast <Row>()
                    .Where(r => r.Index < count)
                    .AsParallel()
                    .Select(row =>
                {
                    Thread.CurrentThread.CurrentCulture   = CultureInfo.GetCultureInfo("ru-RU");
                    Thread.CurrentThread.CurrentUICulture = Thread.CurrentThread.CurrentCulture;

                    var r = new ExelRow()
                    {
                        Index = row.Index
                    };
                    var currMaxColumnsIndex = row.LastCell == null ? 0 : row.LastCell.Column;

                    int lastFilledColumnIndex = Math.Min(currMaxColumnsIndex, maxColumnIndex);

                    #region Read data from cells

                    var cells = Enumerable.Range(0, lastFilledColumnIndex + 1)
                                .Select(cellIndex => new { OriginalCell = row.GetCellOrNull(cellIndex), Index = cellIndex })
                                .Select(c => new { OriginalCell = c.OriginalCell, ResultCell = new ExelCell()
                                                   {
                                                       Value = string.Empty, FormatedValue = string.Empty, CellStyle = new Style(), OriginalIndex = c.Index
                                                   } })
                                .Select(i =>
                    {
                        if (i.OriginalCell != null)
                        {
                            lock (lockCell)
                                i.ResultCell.FormatedValue = i.OriginalCell.StringValue;

                            var link = GetHyperlinkForCell(i.OriginalCell, sheet);
                            if (link != null)
                            {
                                i.ResultCell.HyperLink = link.Address;
                            }
                            else
                            {
                                var formula = string.Empty;
                                lock (lockCell)
                                    formula = i.OriginalCell.Formula;

                                if (!string.IsNullOrWhiteSpace(formula))
                                {
                                    i.ResultCell.HyperLink = formula.Split(new char[] { '\"' }).FirstOrDefault(str => str.Contains("http"));
                                }
                            }

                            if (i.OriginalCell.IsMerged)
                            {
                                i.ResultCell.IsMerged = true;

                                var content = string.Empty;
                                var values  = (IEnumerable)i.OriginalCell.GetMergedRange().Value;
                                if (values != null)
                                {
                                    foreach (var value in values)
                                    {
                                        content += value;
                                    }
                                }

                                i.ResultCell.Value = content;
                            }
                            else if (i.OriginalCell.Value != null)
                            {
                                i.ResultCell.Value = i.OriginalCell.Value.ToString();
                            }
                            else
                            {
                                i.ResultCell.Value = string.Empty;
                            }

                            try
                            {
                                var comment = sheet.Comments[i.OriginalCell.Row, i.ResultCell.OriginalIndex];
                                if (comment != null)
                                {
                                    i.ResultCell.Comment = comment.Note;
                                }
                            }
                            catch { }

                            var style = i.OriginalCell.GetStyle();
                            i.ResultCell.CellStyle = style;
                            i.ResultCell.Color     = (style != null) ? (DefColors.Any(clr => ColorsEqual(clr, style.BackgroundColor)) ? style.ForegroundColor : style.BackgroundColor) : System.Drawing.Color.White;
                            i.ResultCell.Color     = System.Drawing.Color.FromArgb((i.ResultCell.Color.R > byte.MinValue || i.ResultCell.Color.G > byte.MinValue || i.ResultCell.Color.B > byte.MinValue) && i.ResultCell.Color.A == byte.MinValue ? byte.MaxValue : i.ResultCell.Color.A, i.ResultCell.Color.R, i.ResultCell.Color.G, i.ResultCell.Color.B);
                        }
                        return(new { ResultCell = i.ResultCell, Index = i.ResultCell.OriginalIndex });
                    })
                                .OrderBy(i => i.Index)
                                .Select(i => i.ResultCell)
                                .ToArray();
                    r.Cells.AddRange(cells);

                    #endregion

                    var lastStyle = (maxColumnIndex > 0) ? r.Cells[lastFilledColumnIndex].CellStyle : new Style();

                    for (int i = r.Cells.Count; i <= maxColumnIndex; i++)
                    {
                        r.Cells.Add(new ExelCell()
                        {
                            Value = string.Empty, CellStyle = lastStyle
                        });
                    }

                    lock (lockObj)
                    {
                        loaded++;
                        pp0.Value = (decimal)loaded * 100m / (decimal)count;
                    }

                    return(r);
                })
                    .OrderBy(r => r.Index)
                    .Where(r => !deleteEmptyRows || !r.IsEmpty)         // delete empty rows
                    .ToArray();
                result.AddRange(items);
            }

            #endregion

            if (result.Count > 0)
            {
                #region Delete bottom

                //#### Try to delete bottom info ####
                //get last empty index

                var lastEmptyRow             = result.LastOrDefault(row => row.IsEmpty);
                int lastEmptyRowsInDataIndex = lastEmptyRow == null ? 0 : result.IndexOf(lastEmptyRow);

                //int lastEmptyRowsInDataIndex =
                //    result
                //    .Where(row => row.IsEmpty)
                //    .Select(row => result.IndexOf(row))
                //    .OrderByDescending(i => i)
                //    .FirstOrDefault();

                //get last non empty index before last empty to check similarity
                var lastNotEmptyRow = result.Take(lastEmptyRowsInDataIndex == 0 ? result.Count : lastEmptyRowsInDataIndex).LastOrDefault(row => !row.IsEmpty);
                int lastNotEmptyRowsInDataIndexBeforeEmpty = lastNotEmptyRow == null ? 0 : result.IndexOf(lastNotEmptyRow);

                //int lastNotEmptyRowsInDataIndexBeforeEmpty =
                //    result
                //    .Where(row => !row.IsEmpty)
                //    .Select(row => result.IndexOf(row))
                //    .Where(i => i < lastEmptyRowsInDataIndex)
                //    .OrderByDescending(i => i)
                //    .FirstOrDefault();

                if (lastEmptyRowsInDataIndex > 4 && result.Count - lastEmptyRowsInDataIndex < 15)
                {
                    List <ExelRow> similarityIndexes = new List <ExelRow>();
                    for (int i = lastNotEmptyRowsInDataIndexBeforeEmpty; i >= 0 && i > lastNotEmptyRowsInDataIndexBeforeEmpty - 10; i--)
                    {
                        similarityIndexes.Add(result[i]);
                    }

                    for (int z = result.Count - 1; z >= lastEmptyRowsInDataIndex; z--)
                    {
                        if (result[z].NotEmptyCells.Count() <= 4 && !similarityIndexes.Select(s => s.Similarity(result[z])).Any(d => d > 0.6))
                        {
                            result.RemoveAt(z);
                        }
                        else
                        {
                            break;
                        }
                    }
                }

                pp1.Value = 25;

                var isRowsSimilar = new Func <ExelRow, ExelRow, bool>((r1, r2) =>
                {
                    var cnt = Math.Min(r1.Cells.Count, r2.Cells.Count);
                    for (int i = 0; i < cnt; i++)
                    {
                        if (string.Compare(r1.Cells[i].Value.Trim(), r2.Cells[i].Value.Trim(), true) != 0)
                        {
                            return(false);
                        }
                    }
                    return(true);
                });

                var defStart = 4;
                var minCountForDeleteSimilarity = Properties.Settings.Default.MaxRowsInGroupCountToDeleteSimilarityRows;
                for (int i = result.Count - 1; i >= defStart; i--)
                {
                    var n = 1;
                    if ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n]))
                    {
                        var rowIndexes = new List <int>(new int[] { i - n });
                        do
                        {
                            n++;
                            rowIndexes.Add(i - n);
                        } while ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n]));

                        if (n + 1 >= minCountForDeleteSimilarity)
                        {
                            rowIndexes
                            .OrderByDescending(ind => ind)
                            .ToList()
                            .ForEach(ind => result.RemoveAt(ind));
                            i -= n;
                            continue;
                            //result.RemoveAt(i);
                        }
                    }
                }

                pp1.Value = 50;

                #endregion

                //Delete all empty rows from data
                if (deleteEmptyRows)
                {
                    for (int z = result.Count - 1; z >= 0; z--)
                    {
                        var r1 = result[z];
                        if (r1.IsEmpty)
                        {
                            result.RemoveAt(z);
                        }
                    }
                }

                pp1.Value = 100;
            }
            return(result.ToArray());
        }
        public static ExelRow[] LoadRows(Worksheet sheet, int count = 0, Action<int> progressReport = null, bool deleteEmptyRows = true)
        {
            var result = new List<ExelRow>();
            var totalRowsCount = 0;
            var loaded = 0;

            var pp = new Helpers.PercentageProgress();
            if (progressReport != null)
                pp.Change += (s, e) => { progressReport((int)e.Value); };

            var pp0 = pp.GetChild(weight: 9);
            var pp1 = pp.GetChild(weight: 1);

            try
            {
                totalRowsCount = Math.Max(sheet.Cells.MaxRow + 1, sheet.Cells.Rows.Count);
            }
            catch
            {
                totalRowsCount = sheet.Cells.Rows.Count;
            }

            if (count <= 0)
               count = totalRowsCount;

            #region Select max column index for sheet
            int maxColumnIndex = sheet.Cells.Rows.Cast<Row>().Take(count).Select(c =>
            {
                if (c.LastCell != null)
                    for (int i = c.LastCell.Column; i >= 0; i--)
                    {
                        var cell = c.GetCellOrNull(i);
                        if (!string.IsNullOrWhiteSpace(
                                    cell == null || cell.Value == null
                                    ? null
                                    : cell.Value.ToString().Trim()
                                )
                            )
                            return cell.IsMerged ? cell.GetMergedRange().FirstColumn + cell.GetMergedRange().ColumnCount - 1 : i;
                    }
                return 0;
            }
                )
                .Union(new int[] { 0 })
                .Max();
            #endregion
            #region Read rows

            if (sheet.Cells.Count > 0)
            {
                var lockObj = new Object();
                var lockCell = new Object();
                sheet.Cells.MultiThreadReading = true;

                var items =
                        sheet.Cells.Rows.Cast<Row>()
                            .Where(r => r.Index < count)
                            .AsParallel()
                            .Select(row =>
                            {
                                Thread.CurrentThread.CurrentCulture = CultureInfo.GetCultureInfo("ru-RU");
                                Thread.CurrentThread.CurrentUICulture = Thread.CurrentThread.CurrentCulture;

                                var r = new ExelRow() { Index = row.Index };
                                var currMaxColumnsIndex = row.LastCell == null ? 0 : row.LastCell.Column;

                                int lastFilledColumnIndex = Math.Min(currMaxColumnsIndex, maxColumnIndex);

                                #region Read data from cells

                                var cells = Enumerable.Range(0, lastFilledColumnIndex + 1)
                                    .Select(cellIndex => new { OriginalCell = row.GetCellOrNull(cellIndex), Index = cellIndex })
                                    .Select(c => new { OriginalCell = c.OriginalCell, ResultCell = new ExelCell() { Value = string.Empty, FormatedValue = string.Empty, CellStyle = new Style(), OriginalIndex = c.Index } })
                                    .Select(i =>
                                        {
                                            if (i.OriginalCell != null)
                                            {
                                                lock(lockCell)
                                                    i.ResultCell.FormatedValue = i.OriginalCell.StringValue;

                                                var link = GetHyperlinkForCell(i.OriginalCell, sheet);
                                                if (link != null)
                                                    i.ResultCell.HyperLink = link.Address;
                                                else
                                                {
                                                    var formula = string.Empty;
                                                    lock(lockCell)
                                                        formula = i.OriginalCell.Formula;

                                                    if (!string.IsNullOrWhiteSpace(formula))
                                                        i.ResultCell.HyperLink = formula.Split(new char[] { '\"' }).FirstOrDefault(str => str.Contains("http"));
                                                }

                                                if (i.OriginalCell.IsMerged)
                                                {
                                                    i.ResultCell.IsMerged = true;

                                                    var content = string.Empty;
                                                    var values = (IEnumerable)i.OriginalCell.GetMergedRange().Value;
                                                    if (values != null)
                                                        foreach (var value in values)
                                                            content += value;

                                                    i.ResultCell.Value = content;
                                                }
                                                else if (i.OriginalCell.Value != null)
                                                {
                                                    i.ResultCell.Value = i.OriginalCell.Value.ToString();
                                                }
                                                else
                                                {
                                                    i.ResultCell.Value = string.Empty;
                                                }

                                                try
                                                {
                                                    var comment = sheet.Comments[i.OriginalCell.Row, i.ResultCell.OriginalIndex];
                                                    if (comment != null)
                                                        i.ResultCell.Comment = comment.Note;
                                                }
                                                catch { }

                                                var style = i.OriginalCell.GetStyle();
                                                i.ResultCell.CellStyle = style;
                                                i.ResultCell.Color = (style != null) ? (DefColors.Any(clr => ColorsEqual(clr, style.BackgroundColor)) ? style.ForegroundColor : style.BackgroundColor) : System.Drawing.Color.White;
                                                i.ResultCell.Color = System.Drawing.Color.FromArgb((i.ResultCell.Color.R > byte.MinValue || i.ResultCell.Color.G > byte.MinValue || i.ResultCell.Color.B > byte.MinValue) && i.ResultCell.Color.A == byte.MinValue ? byte.MaxValue : i.ResultCell.Color.A, i.ResultCell.Color.R, i.ResultCell.Color.G, i.ResultCell.Color.B);
                                            }
                                            return new { ResultCell = i.ResultCell, Index = i.ResultCell.OriginalIndex };
                                        })
                                    .OrderBy(i => i.Index)
                                    .Select(i => i.ResultCell)
                                    .ToArray();
                                r.Cells.AddRange(cells);

                                #endregion

                                var lastStyle = (maxColumnIndex > 0) ? r.Cells[lastFilledColumnIndex].CellStyle : new Style();

                                for (int i = r.Cells.Count; i <= maxColumnIndex; i++)
                                    r.Cells.Add(new ExelCell() { Value = string.Empty, CellStyle = lastStyle });

                                lock (lockObj)
                                {
                                    loaded++;
                                    pp0.Value = (decimal)loaded * 100m / (decimal)count;
                                }

                                return r;
                            })
                            .OrderBy(r => r.Index)
                            .Where(r => !deleteEmptyRows || !r.IsEmpty) // delete empty rows
                            .ToArray();
                result.AddRange(items);
            }

            #endregion

            if (result.Count > 0)
            {
                #region Delete bottom

                //#### Try to delete bottom info ####
                //get last empty index

                var lastEmptyRow = result.LastOrDefault(row => row.IsEmpty);
                int lastEmptyRowsInDataIndex = lastEmptyRow == null ? 0 : result.IndexOf(lastEmptyRow);

                //int lastEmptyRowsInDataIndex =
                //    result
                //    .Where(row => row.IsEmpty)
                //    .Select(row => result.IndexOf(row))
                //    .OrderByDescending(i => i)
                //    .FirstOrDefault();

                //get last non empty index before last empty to check similarity
                var lastNotEmptyRow = result.Take(lastEmptyRowsInDataIndex == 0 ? result.Count : lastEmptyRowsInDataIndex).LastOrDefault(row => !row.IsEmpty);
                int lastNotEmptyRowsInDataIndexBeforeEmpty = lastNotEmptyRow == null ? 0 : result.IndexOf(lastNotEmptyRow);

                //int lastNotEmptyRowsInDataIndexBeforeEmpty =
                //    result
                //    .Where(row => !row.IsEmpty)
                //    .Select(row => result.IndexOf(row))
                //    .Where(i => i < lastEmptyRowsInDataIndex)
                //    .OrderByDescending(i => i)
                //    .FirstOrDefault();

                if (lastEmptyRowsInDataIndex > 4 && result.Count - lastEmptyRowsInDataIndex < 15)
                {
                    List<ExelRow> similarityIndexes = new List<ExelRow>();
                    for (int i = lastNotEmptyRowsInDataIndexBeforeEmpty; i >= 0 && i > lastNotEmptyRowsInDataIndexBeforeEmpty - 10; i--)
                        similarityIndexes.Add(result[i]);

                    for (int z = result.Count - 1; z >= lastEmptyRowsInDataIndex; z--)
                    {
                        if (result[z].NotEmptyCells.Count() <= 4 && !similarityIndexes.Select(s => s.Similarity(result[z])).Any(d => d > 0.6))
                            result.RemoveAt(z);
                        else
                            break;
                    }
                }

                pp1.Value = 25;

                var isRowsSimilar = new Func<ExelRow, ExelRow, bool>((r1, r2) =>
                {
                    var cnt = Math.Min(r1.Cells.Count, r2.Cells.Count);
                    for (int i = 0; i < cnt; i++)
                        if (string.Compare(r1.Cells[i].Value.Trim(), r2.Cells[i].Value.Trim(), true) != 0)
                            return false;
                    return true;
                });

                var defStart = 4;
                var minCountForDeleteSimilarity = Properties.Settings.Default.MaxRowsInGroupCountToDeleteSimilarityRows;
                for (int i=result.Count-1; i>= defStart; i--)
                {
                    var n = 1;
                    if ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n]))
                    {
                        var rowIndexes = new List<int>(new int[] { i - n });
                        do
                        {
                            n++;
                            rowIndexes.Add(i - n);
                        } while ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n]));

                        if (n + 1 >= minCountForDeleteSimilarity)
                        {
                            rowIndexes
                                .OrderByDescending(ind => ind)
                                .ToList()
                                .ForEach(ind => result.RemoveAt(ind));
                            i -= n;
                            continue;
                            //result.RemoveAt(i);
                        }
                    }
                }

                pp1.Value = 50;

                #endregion

                //Delete all empty rows from data
                if (deleteEmptyRows)
                    for (int z = result.Count - 1; z >= 0; z--)
                    {
                        var r1 = result[z];
                        if (r1.IsEmpty)
                            result.RemoveAt(z);
                    }

                pp1.Value = 100;
            }
            return result.ToArray();
        }