public double Similarity(ExelRow exelRow, int maxColumnsCount = 0) { double result = 0.0; bool wasError = false; var logSession = Helpers.Old.Log.SessionStart("ExelRow.Similarity()", true); try { if (maxColumnsCount == 0) { maxColumnsCount = int.MaxValue; } //if (exelRow.Cells.Count == this.Cells.Count) //{ int maxValue = Math.Min(Math.Min(exelRow.Cells.Count, this.Cells.Count), maxColumnsCount); if (maxValue > 0) { for (int i = 0; i < maxValue; i++) { result += exelRow.Cells[i].IsMerged == this.Cells[i].IsMerged ? 0.09 : 0; result += string.IsNullOrEmpty(exelRow.Cells[i].Value) == string.IsNullOrEmpty(this.Cells[i].Value) ? 0.44 : 0.0; result += Math.Abs(exelRow.Cells[i].UniqueWeight - this.Cells[i].UniqueWeight) < 0.07 ? 0.15 : 0.0; result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.ForegroundColor, this.Cells[i].CellStyle.ForegroundColor) ? 0.16 : 0.0; result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.BackgroundColor, this.Cells[i].CellStyle.BackgroundColor) ? 0.16 : 0.0; } } result = result / maxValue; //} if (maxColumnsCount == int.MaxValue) { result = result * 0.55; result += ((double)Math.Min(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count()) / (double)Math.Max(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count())) * 0.4; //result += (this.NotEmptyCells.Count() == exelRow.NotEmptyCells.Count()) ? 0.4 : 0; result += (this.UniqueNotEmptyCells.Count() == exelRow.UniqueNotEmptyCells.Count()) ? 0.05 : 0; } } catch (Exception ex) { wasError = true; Helpers.Old.Log.Add(logSession, ex); throw ex; } finally { Helpers.Old.Log.SessionEnd(logSession, wasError); } return(result); }
public double Similarity(ExelRow exelRow, int maxColumnsCount = 0) { double result = 0.0; bool wasError = false; var logSession = Helpers.Old.Log.SessionStart("ExelRow.Similarity()", true); try { if (maxColumnsCount == 0) maxColumnsCount = int.MaxValue; //if (exelRow.Cells.Count == this.Cells.Count) //{ int maxValue = Math.Min(Math.Min(exelRow.Cells.Count, this.Cells.Count), maxColumnsCount); if (maxValue > 0) for (int i = 0; i < maxValue; i++) { result += exelRow.Cells[i].IsMerged == this.Cells[i].IsMerged ? 0.09 : 0; result += string.IsNullOrEmpty(exelRow.Cells[i].Value) == string.IsNullOrEmpty(this.Cells[i].Value) ? 0.44 : 0.0; result += Math.Abs(exelRow.Cells[i].UniqueWeight - this.Cells[i].UniqueWeight) < 0.07 ? 0.15 : 0.0; result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.ForegroundColor,this.Cells[i].CellStyle.ForegroundColor) ? 0.16 : 0.0; result += AsyncDocumentLoader.ColorsEqual(exelRow.Cells[i].CellStyle.BackgroundColor, this.Cells[i].CellStyle.BackgroundColor) ? 0.16 : 0.0; } result = result / maxValue; //} if (maxColumnsCount == int.MaxValue) { result = result * 0.55; result += ((double)Math.Min(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count()) / (double)Math.Max(this.NotEmptyCells.Count(), exelRow.NotEmptyCells.Count())) * 0.4; //result += (this.NotEmptyCells.Count() == exelRow.NotEmptyCells.Count()) ? 0.4 : 0; result += (this.UniqueNotEmptyCells.Count() == exelRow.UniqueNotEmptyCells.Count()) ? 0.05 : 0; } } catch(Exception ex) { wasError = true; Helpers.Old.Log.Add(logSession, ex); throw ex; } finally { Helpers.Old.Log.SessionEnd(logSession, wasError); } return result; }
public static ExelRow[] LoadRows(Worksheet sheet, int count = 0, Action <int> progressReport = null, bool deleteEmptyRows = true) { var result = new List <ExelRow>(); var totalRowsCount = 0; var loaded = 0; var pp = new Helpers.PercentageProgress(); if (progressReport != null) { pp.Change += (s, e) => { progressReport((int)e.Value); } } ; var pp0 = pp.GetChild(weight: 9); var pp1 = pp.GetChild(weight: 1); try { totalRowsCount = Math.Max(sheet.Cells.MaxRow + 1, sheet.Cells.Rows.Count); } catch { totalRowsCount = sheet.Cells.Rows.Count; } if (count <= 0) { count = totalRowsCount; } #region Select max column index for sheet int maxColumnIndex = sheet.Cells.Rows.Cast <Row>().Take(count).Select(c => { if (c.LastCell != null) { for (int i = c.LastCell.Column; i >= 0; i--) { var cell = c.GetCellOrNull(i); if (!string.IsNullOrWhiteSpace( cell == null || cell.Value == null ? null : cell.Value.ToString().Trim() ) ) { return(cell.IsMerged ? cell.GetMergedRange().FirstColumn + cell.GetMergedRange().ColumnCount - 1 : i); } } } return(0); } ) .Union(new int[] { 0 }) .Max(); #endregion #region Read rows if (sheet.Cells.Count > 0) { var lockObj = new Object(); var lockCell = new Object(); sheet.Cells.MultiThreadReading = true; var items = sheet.Cells.Rows.Cast <Row>() .Where(r => r.Index < count) .AsParallel() .Select(row => { Thread.CurrentThread.CurrentCulture = CultureInfo.GetCultureInfo("ru-RU"); Thread.CurrentThread.CurrentUICulture = Thread.CurrentThread.CurrentCulture; var r = new ExelRow() { Index = row.Index }; var currMaxColumnsIndex = row.LastCell == null ? 0 : row.LastCell.Column; int lastFilledColumnIndex = Math.Min(currMaxColumnsIndex, maxColumnIndex); #region Read data from cells var cells = Enumerable.Range(0, lastFilledColumnIndex + 1) .Select(cellIndex => new { OriginalCell = row.GetCellOrNull(cellIndex), Index = cellIndex }) .Select(c => new { OriginalCell = c.OriginalCell, ResultCell = new ExelCell() { Value = string.Empty, FormatedValue = string.Empty, CellStyle = new Style(), OriginalIndex = c.Index } }) .Select(i => { if (i.OriginalCell != null) { lock (lockCell) i.ResultCell.FormatedValue = i.OriginalCell.StringValue; var link = GetHyperlinkForCell(i.OriginalCell, sheet); if (link != null) { i.ResultCell.HyperLink = link.Address; } else { var formula = string.Empty; lock (lockCell) formula = i.OriginalCell.Formula; if (!string.IsNullOrWhiteSpace(formula)) { i.ResultCell.HyperLink = formula.Split(new char[] { '\"' }).FirstOrDefault(str => str.Contains("http")); } } if (i.OriginalCell.IsMerged) { i.ResultCell.IsMerged = true; var content = string.Empty; var values = (IEnumerable)i.OriginalCell.GetMergedRange().Value; if (values != null) { foreach (var value in values) { content += value; } } i.ResultCell.Value = content; } else if (i.OriginalCell.Value != null) { i.ResultCell.Value = i.OriginalCell.Value.ToString(); } else { i.ResultCell.Value = string.Empty; } try { var comment = sheet.Comments[i.OriginalCell.Row, i.ResultCell.OriginalIndex]; if (comment != null) { i.ResultCell.Comment = comment.Note; } } catch { } var style = i.OriginalCell.GetStyle(); i.ResultCell.CellStyle = style; i.ResultCell.Color = (style != null) ? (DefColors.Any(clr => ColorsEqual(clr, style.BackgroundColor)) ? style.ForegroundColor : style.BackgroundColor) : System.Drawing.Color.White; i.ResultCell.Color = System.Drawing.Color.FromArgb((i.ResultCell.Color.R > byte.MinValue || i.ResultCell.Color.G > byte.MinValue || i.ResultCell.Color.B > byte.MinValue) && i.ResultCell.Color.A == byte.MinValue ? byte.MaxValue : i.ResultCell.Color.A, i.ResultCell.Color.R, i.ResultCell.Color.G, i.ResultCell.Color.B); } return(new { ResultCell = i.ResultCell, Index = i.ResultCell.OriginalIndex }); }) .OrderBy(i => i.Index) .Select(i => i.ResultCell) .ToArray(); r.Cells.AddRange(cells); #endregion var lastStyle = (maxColumnIndex > 0) ? r.Cells[lastFilledColumnIndex].CellStyle : new Style(); for (int i = r.Cells.Count; i <= maxColumnIndex; i++) { r.Cells.Add(new ExelCell() { Value = string.Empty, CellStyle = lastStyle }); } lock (lockObj) { loaded++; pp0.Value = (decimal)loaded * 100m / (decimal)count; } return(r); }) .OrderBy(r => r.Index) .Where(r => !deleteEmptyRows || !r.IsEmpty) // delete empty rows .ToArray(); result.AddRange(items); } #endregion if (result.Count > 0) { #region Delete bottom //#### Try to delete bottom info #### //get last empty index var lastEmptyRow = result.LastOrDefault(row => row.IsEmpty); int lastEmptyRowsInDataIndex = lastEmptyRow == null ? 0 : result.IndexOf(lastEmptyRow); //int lastEmptyRowsInDataIndex = // result // .Where(row => row.IsEmpty) // .Select(row => result.IndexOf(row)) // .OrderByDescending(i => i) // .FirstOrDefault(); //get last non empty index before last empty to check similarity var lastNotEmptyRow = result.Take(lastEmptyRowsInDataIndex == 0 ? result.Count : lastEmptyRowsInDataIndex).LastOrDefault(row => !row.IsEmpty); int lastNotEmptyRowsInDataIndexBeforeEmpty = lastNotEmptyRow == null ? 0 : result.IndexOf(lastNotEmptyRow); //int lastNotEmptyRowsInDataIndexBeforeEmpty = // result // .Where(row => !row.IsEmpty) // .Select(row => result.IndexOf(row)) // .Where(i => i < lastEmptyRowsInDataIndex) // .OrderByDescending(i => i) // .FirstOrDefault(); if (lastEmptyRowsInDataIndex > 4 && result.Count - lastEmptyRowsInDataIndex < 15) { List <ExelRow> similarityIndexes = new List <ExelRow>(); for (int i = lastNotEmptyRowsInDataIndexBeforeEmpty; i >= 0 && i > lastNotEmptyRowsInDataIndexBeforeEmpty - 10; i--) { similarityIndexes.Add(result[i]); } for (int z = result.Count - 1; z >= lastEmptyRowsInDataIndex; z--) { if (result[z].NotEmptyCells.Count() <= 4 && !similarityIndexes.Select(s => s.Similarity(result[z])).Any(d => d > 0.6)) { result.RemoveAt(z); } else { break; } } } pp1.Value = 25; var isRowsSimilar = new Func <ExelRow, ExelRow, bool>((r1, r2) => { var cnt = Math.Min(r1.Cells.Count, r2.Cells.Count); for (int i = 0; i < cnt; i++) { if (string.Compare(r1.Cells[i].Value.Trim(), r2.Cells[i].Value.Trim(), true) != 0) { return(false); } } return(true); }); var defStart = 4; var minCountForDeleteSimilarity = Properties.Settings.Default.MaxRowsInGroupCountToDeleteSimilarityRows; for (int i = result.Count - 1; i >= defStart; i--) { var n = 1; if ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n])) { var rowIndexes = new List <int>(new int[] { i - n }); do { n++; rowIndexes.Add(i - n); } while ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n])); if (n + 1 >= minCountForDeleteSimilarity) { rowIndexes .OrderByDescending(ind => ind) .ToList() .ForEach(ind => result.RemoveAt(ind)); i -= n; continue; //result.RemoveAt(i); } } } pp1.Value = 50; #endregion //Delete all empty rows from data if (deleteEmptyRows) { for (int z = result.Count - 1; z >= 0; z--) { var r1 = result[z]; if (r1.IsEmpty) { result.RemoveAt(z); } } } pp1.Value = 100; } return(result.ToArray()); }
public static ExelRow[] LoadRows(Worksheet sheet, int count = 0, Action<int> progressReport = null, bool deleteEmptyRows = true) { var result = new List<ExelRow>(); var totalRowsCount = 0; var loaded = 0; var pp = new Helpers.PercentageProgress(); if (progressReport != null) pp.Change += (s, e) => { progressReport((int)e.Value); }; var pp0 = pp.GetChild(weight: 9); var pp1 = pp.GetChild(weight: 1); try { totalRowsCount = Math.Max(sheet.Cells.MaxRow + 1, sheet.Cells.Rows.Count); } catch { totalRowsCount = sheet.Cells.Rows.Count; } if (count <= 0) count = totalRowsCount; #region Select max column index for sheet int maxColumnIndex = sheet.Cells.Rows.Cast<Row>().Take(count).Select(c => { if (c.LastCell != null) for (int i = c.LastCell.Column; i >= 0; i--) { var cell = c.GetCellOrNull(i); if (!string.IsNullOrWhiteSpace( cell == null || cell.Value == null ? null : cell.Value.ToString().Trim() ) ) return cell.IsMerged ? cell.GetMergedRange().FirstColumn + cell.GetMergedRange().ColumnCount - 1 : i; } return 0; } ) .Union(new int[] { 0 }) .Max(); #endregion #region Read rows if (sheet.Cells.Count > 0) { var lockObj = new Object(); var lockCell = new Object(); sheet.Cells.MultiThreadReading = true; var items = sheet.Cells.Rows.Cast<Row>() .Where(r => r.Index < count) .AsParallel() .Select(row => { Thread.CurrentThread.CurrentCulture = CultureInfo.GetCultureInfo("ru-RU"); Thread.CurrentThread.CurrentUICulture = Thread.CurrentThread.CurrentCulture; var r = new ExelRow() { Index = row.Index }; var currMaxColumnsIndex = row.LastCell == null ? 0 : row.LastCell.Column; int lastFilledColumnIndex = Math.Min(currMaxColumnsIndex, maxColumnIndex); #region Read data from cells var cells = Enumerable.Range(0, lastFilledColumnIndex + 1) .Select(cellIndex => new { OriginalCell = row.GetCellOrNull(cellIndex), Index = cellIndex }) .Select(c => new { OriginalCell = c.OriginalCell, ResultCell = new ExelCell() { Value = string.Empty, FormatedValue = string.Empty, CellStyle = new Style(), OriginalIndex = c.Index } }) .Select(i => { if (i.OriginalCell != null) { lock(lockCell) i.ResultCell.FormatedValue = i.OriginalCell.StringValue; var link = GetHyperlinkForCell(i.OriginalCell, sheet); if (link != null) i.ResultCell.HyperLink = link.Address; else { var formula = string.Empty; lock(lockCell) formula = i.OriginalCell.Formula; if (!string.IsNullOrWhiteSpace(formula)) i.ResultCell.HyperLink = formula.Split(new char[] { '\"' }).FirstOrDefault(str => str.Contains("http")); } if (i.OriginalCell.IsMerged) { i.ResultCell.IsMerged = true; var content = string.Empty; var values = (IEnumerable)i.OriginalCell.GetMergedRange().Value; if (values != null) foreach (var value in values) content += value; i.ResultCell.Value = content; } else if (i.OriginalCell.Value != null) { i.ResultCell.Value = i.OriginalCell.Value.ToString(); } else { i.ResultCell.Value = string.Empty; } try { var comment = sheet.Comments[i.OriginalCell.Row, i.ResultCell.OriginalIndex]; if (comment != null) i.ResultCell.Comment = comment.Note; } catch { } var style = i.OriginalCell.GetStyle(); i.ResultCell.CellStyle = style; i.ResultCell.Color = (style != null) ? (DefColors.Any(clr => ColorsEqual(clr, style.BackgroundColor)) ? style.ForegroundColor : style.BackgroundColor) : System.Drawing.Color.White; i.ResultCell.Color = System.Drawing.Color.FromArgb((i.ResultCell.Color.R > byte.MinValue || i.ResultCell.Color.G > byte.MinValue || i.ResultCell.Color.B > byte.MinValue) && i.ResultCell.Color.A == byte.MinValue ? byte.MaxValue : i.ResultCell.Color.A, i.ResultCell.Color.R, i.ResultCell.Color.G, i.ResultCell.Color.B); } return new { ResultCell = i.ResultCell, Index = i.ResultCell.OriginalIndex }; }) .OrderBy(i => i.Index) .Select(i => i.ResultCell) .ToArray(); r.Cells.AddRange(cells); #endregion var lastStyle = (maxColumnIndex > 0) ? r.Cells[lastFilledColumnIndex].CellStyle : new Style(); for (int i = r.Cells.Count; i <= maxColumnIndex; i++) r.Cells.Add(new ExelCell() { Value = string.Empty, CellStyle = lastStyle }); lock (lockObj) { loaded++; pp0.Value = (decimal)loaded * 100m / (decimal)count; } return r; }) .OrderBy(r => r.Index) .Where(r => !deleteEmptyRows || !r.IsEmpty) // delete empty rows .ToArray(); result.AddRange(items); } #endregion if (result.Count > 0) { #region Delete bottom //#### Try to delete bottom info #### //get last empty index var lastEmptyRow = result.LastOrDefault(row => row.IsEmpty); int lastEmptyRowsInDataIndex = lastEmptyRow == null ? 0 : result.IndexOf(lastEmptyRow); //int lastEmptyRowsInDataIndex = // result // .Where(row => row.IsEmpty) // .Select(row => result.IndexOf(row)) // .OrderByDescending(i => i) // .FirstOrDefault(); //get last non empty index before last empty to check similarity var lastNotEmptyRow = result.Take(lastEmptyRowsInDataIndex == 0 ? result.Count : lastEmptyRowsInDataIndex).LastOrDefault(row => !row.IsEmpty); int lastNotEmptyRowsInDataIndexBeforeEmpty = lastNotEmptyRow == null ? 0 : result.IndexOf(lastNotEmptyRow); //int lastNotEmptyRowsInDataIndexBeforeEmpty = // result // .Where(row => !row.IsEmpty) // .Select(row => result.IndexOf(row)) // .Where(i => i < lastEmptyRowsInDataIndex) // .OrderByDescending(i => i) // .FirstOrDefault(); if (lastEmptyRowsInDataIndex > 4 && result.Count - lastEmptyRowsInDataIndex < 15) { List<ExelRow> similarityIndexes = new List<ExelRow>(); for (int i = lastNotEmptyRowsInDataIndexBeforeEmpty; i >= 0 && i > lastNotEmptyRowsInDataIndexBeforeEmpty - 10; i--) similarityIndexes.Add(result[i]); for (int z = result.Count - 1; z >= lastEmptyRowsInDataIndex; z--) { if (result[z].NotEmptyCells.Count() <= 4 && !similarityIndexes.Select(s => s.Similarity(result[z])).Any(d => d > 0.6)) result.RemoveAt(z); else break; } } pp1.Value = 25; var isRowsSimilar = new Func<ExelRow, ExelRow, bool>((r1, r2) => { var cnt = Math.Min(r1.Cells.Count, r2.Cells.Count); for (int i = 0; i < cnt; i++) if (string.Compare(r1.Cells[i].Value.Trim(), r2.Cells[i].Value.Trim(), true) != 0) return false; return true; }); var defStart = 4; var minCountForDeleteSimilarity = Properties.Settings.Default.MaxRowsInGroupCountToDeleteSimilarityRows; for (int i=result.Count-1; i>= defStart; i--) { var n = 1; if ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n])) { var rowIndexes = new List<int>(new int[] { i - n }); do { n++; rowIndexes.Add(i - n); } while ((i - n >= defStart) && result[i].Similarity(result[i - n]) >= 0.8 && isRowsSimilar(result[i], result[i - n])); if (n + 1 >= minCountForDeleteSimilarity) { rowIndexes .OrderByDescending(ind => ind) .ToList() .ForEach(ind => result.RemoveAt(ind)); i -= n; continue; //result.RemoveAt(i); } } } pp1.Value = 50; #endregion //Delete all empty rows from data if (deleteEmptyRows) for (int z = result.Count - 1; z >= 0; z--) { var r1 = result[z]; if (r1.IsEmpty) result.RemoveAt(z); } pp1.Value = 100; } return result.ToArray(); }