/// <summary>
/// Writes the CRF feature file for a single table.
/// </summary>
/// <remarks>
/// One line is written per row returned by <c>feaRow.GenerateSingularFeatureCRF</c>, in ascending
/// row order. Each line starts with the key <c>fileName____sheetName____(row - 1)</c> (spaces in
/// the sheet name are replaced by "__"), followed by "1 " or "0 " for every feature, and ends
/// with the literal label "Title". The file at <c>Config.CRFTMPFEATURE</c> is rewritten on every call.
/// </remarks>
/// <param name="fileName">Workbook file name used as the first part of each line key.</param>
/// <param name="sheetName">Sheet name used as the second part of each line key.</param>
/// <param name="mSheet">Table whose rows are converted to feature vectors.</param>
public void ProcessTable(string fileName, string sheetName, MSheet mSheet)
{
    Dictionary<int, List<bool>> feaDict = this.feaRow.GenerateSingularFeatureCRF(mSheet);
    string outPath = Config.CRFTMPFEATURE;

    // "using" guarantees the file handle is released even if a write throws;
    // the previous explicit Close() was skipped on any exception.
    using (StreamWriter fout = new StreamWriter(outPath))
    {
        List<int> sortedRows = new List<int>(feaDict.Keys);
        sortedRows.Sort();

        foreach (int row in sortedRows)
        {
            // The key carries row - 1; the reader of the prediction file adds 1 back.
            int a = row - 1;
            fout.Write(fileName + "____" + sheetName.Replace(" ", "__") + "____" + a + " ");

            foreach (bool feature in feaDict[row])
            {
                fout.Write(feature ? "1 " : "0 ");
            }

            fout.WriteLine("Title");
        }
    }
}
/// <summary>
/// Converts the CRF prediction file into the final per-table output and stores the
/// predicted label of every row in <paramref name="mSheet"/>.
/// </summary>
/// <remarks>
/// Reads <c>Config.CRFTMPPREDICT</c> line by line. The first whitespace-separated token of a line
/// is the key (its last "____"-separated part is the row number minus one), the last token is the
/// predicted label. For each line, "(row + 1) TAB label" is written to
/// <c>Config.OUTPUTDIR/workbookName____sheetName____counter</c> and the label is added to
/// <c>mSheet.Labels</c> under <c>row + 1</c>. Blank lines are skipped.
/// </remarks>
/// <param name="workbookName">Workbook name used in the output file name.</param>
/// <param name="sheetName">Sheet name used in the output file name.</param>
/// <param name="mSheet">Table that receives the predicted row labels.</param>
/// <param name="counter">Suffix appended to the output file name.</param>
/// <exception cref="FormatException">The last part of a line key is not an integer.</exception>
public static void Run(string workbookName, string sheetName, MSheet mSheet, int counter = 0)
{
    string predictPath = Config.CRFTMPPREDICT;
    Console.WriteLine("Generating final output");
    string outPath = Config.OUTPUTDIR + "/" + workbookName + "____" + sheetName + "____" + counter;

    // Both streams are disposed even when parsing or label insertion throws;
    // previously an exception left the two files open.
    using (StreamReader fin = new StreamReader(predictPath))
    using (StreamWriter fout = new StreamWriter(outPath))
    {
        string line;
        while ((line = fin.ReadLine()) != null)
        {
            // Regex.Split always yields at least one element, so only the
            // empty-key case (blank line) needs to be filtered.
            string[] strArr = Regex.Split(line.Trim(), @"\s+");
            string cKey = strArr[0];
            if (cKey.Length == 0)
            {
                continue;
            }

            string label = strArr[strArr.Length - 1];
            string[] keyParts = Regex.Split(cKey.Trim(), @"____");
            int row = int.Parse(keyParts[keyParts.Length - 1]);

            fout.WriteLine((row + 1).ToString() + "\t" + label);
            mSheet.Labels.Add(row + 1, DataTypes.String2RowLabel(label));
        }
    }

    Console.WriteLine("Successfully obtain prediction results");
}
/// <summary>
/// Builds the feature vector of every non-empty row of the table.
/// </summary>
/// <param name="mSheet">Table whose <c>SheetDict</c> holds the cells keyed by (row, column).</param>
/// <returns>
/// Map from row number to its feature vector. Rows without any cell in <c>SheetDict</c> are absent.
/// </returns>
public Dictionary<int, List<bool>> GenerateSingularFeatureCRF(MSheet mSheet)
{
    Dictionary<int, List<bool>> featuresByRow = new Dictionary<int, List<bool>>();

    for (int rowIdx = mSheet.StartRow; rowIdx < mSheet.StartRow + mSheet.RowNum; ++rowIdx)
    {
        // Collect the cells that actually exist on this row, keyed by column.
        Dictionary<int, MCell> cellsInRow = new Dictionary<int, MCell>();
        for (int colIdx = mSheet.StartCol; colIdx < mSheet.StartCol + mSheet.ColNum; ++colIdx)
        {
            Tuple<int, int> position = Tuple.Create(rowIdx, colIdx);
            if (mSheet.SheetDict.ContainsKey(position))
            {
                cellsInRow.Add(colIdx, mSheet.SheetDict[position]);
            }
        }

        if (cellsInRow.Count != 0)
        {
            // The flag is set when the row directly above produced no feature vector.
            bool previousRowMissing = !featuresByRow.ContainsKey(rowIdx - 1);
            List<bool> rowFeatures = this.generateFeatureByRowCRF(rowIdx, cellsInRow, mSheet, previousRowMissing);
            featuresByRow.Add(rowIdx, rowFeatures);
        }
    }

    return featuresByRow;
}
/// <summary>
/// Lazily walks every ".xlsx" workbook in <c>Config.SHEETDIR</c> and yields each table found in it.
/// </summary>
/// <remarks>
/// Files whose name starts with "~$" are skipped. Before a table is yielded,
/// <c>ProcessTable</c> writes its feature file, so the consumer can run prediction on it
/// immediately. Progress is printed after every 100 workbooks.
/// </remarks>
/// <returns>Tuples of (workbook file name, sheet name, table).</returns>
public IEnumerable<Tuple<string, string, MSheet>> ScanEachExcel()
{
    int counter = 0;
    string[] files = Directory.GetFiles(Config.SHEETDIR);
    for (int i = 0; i < files.Length; ++i)
    {
        string fileName = Path.GetFileName(files[i]);

        // Ordinal comparison: file-name checks must not depend on the current culture.
        if (!fileName.EndsWith("xlsx", StringComparison.Ordinal) || fileName.StartsWith("~$", StringComparison.Ordinal))
        {
            continue;
        }

        Console.WriteLine("Processing " + fileName);
        SheetLoader sheetLoader = new SheetLoader(Config.SHEETDIR + "/" + fileName);

        // try/finally (a catch clause is not allowed around "yield return") makes sure the
        // workbook is closed when processing throws or when the caller stops enumerating early;
        // previously CloseWorkbook() was only reached after a complete, error-free pass.
        try
        {
            foreach (Tuple<string, MSheet> sheetDict in sheetLoader.FetchSheetDict())
            {
                string sheetName = sheetDict.Item1;
                MSheet mSheet = sheetDict.Item2;
                this.ProcessTable(fileName, sheetName, mSheet);
                yield return new Tuple<string, string, MSheet>(fileName, sheetName, mSheet);
            }
        }
        finally
        {
            sheetLoader.CloseWorkbook();
        }

        ++counter;
        if (counter % 100 == 0)
        {
            Console.WriteLine("Current: " + counter);
        }
    }
}
/// <summary>
/// Attaches every header node that has no parent yet to the sheet's root node.
/// </summary>
/// <param name="mSheet">Table whose <c>Nodes</c> are linked under <c>RootNode</c>.</param>
private void linkToRoot(MSheet mSheet)
{
    foreach (Tuple<int, int> position in mSheet.Nodes.Keys)
    {
        HeaderNode candidate = mSheet.Nodes[position];
        if (candidate.HasParent())
        {
            continue;
        }

        // Orphan node: register it on both sides of the parent/child relation.
        mSheet.RootNode.AddChild(candidate);
        candidate.Parent = mSheet.RootNode;
    }
}
/// <summary>
/// Shows how to use the frame finder: trains the CRF model, then for each table found by
/// <c>PredictSheetRow.ScanEachExcel</c> predicts its row labels, writes the final output and
/// extracts the header hierarchy.
/// </summary>
public static void ProcessEachTable()
{
    CRFRunner runner = new CRFRunner();
    runner.Train();

    PredictSheetRow scanner = new PredictSheetRow();
    foreach (Tuple<string, string, MSheet> table in scanner.ScanEachExcel())
    {
        string workbook = table.Item1;
        string sheet = table.Item2;
        MSheet content = table.Item3;

        // Order matters: prediction output must exist before it is transformed into labels,
        // and the labels must be set before the hierarchy is extracted.
        runner.Predict(workbook, sheet);
        TransformOutput.Run(workbook, sheet, content);

        HorizontalHierarchyExtractor extractor = new HorizontalHierarchyExtractor();
        extractor.ExtractFromMSheet(content);
    }
}
/// <summary>
/// Creates one <c>HeaderNode</c> for every cell (or merged area) that lies on a row labelled
/// <c>RowLabel.Header</c> and stores it in <c>mSheet.Nodes</c>, keyed by the (row, column) of the
/// cell or of the top-left cell of its merged area.
/// </summary>
/// <param name="mSheet">Table whose nodes are (re)initialised.</param>
/// <param name="rowNum">Number of rows of the table; sizes the visited map.</param>
/// <param name="colNum">Number of columns of the table; sizes the visited map.</param>
private void initNodes(MSheet mSheet, int rowNum, int colNum)
{
    mSheet.InitNodes();

    // Elements of a new bool array are already false, no explicit reset is needed.
    bool[,] vis = new bool[rowNum, colNum];

    for (int j = mSheet.StartCol; j < mSheet.StartCol + mSheet.ColNum; ++j)
    {
        for (int i = mSheet.StartRow; i < mSheet.StartRow + mSheet.RowNum; ++i)
        {
            if (vis[i - mSheet.StartRow, j - mSheet.StartCol])
            {
                continue;
            }

            if (!(mSheet.Labels.Keys.Contains(i) && mSheet.Labels[i] == RowLabel.Header))
            {
                continue;
            }

            // mSheet.Cells is the table's own range and the Range indexer is 1-based
            // relative to that range, so the sheet coordinates (i, j) must be converted.
            // Passing them unchanged read the wrong cell for tables not starting at A1.
            Range cell = mSheet.Cells.Cells[i - mSheet.StartRow + 1, j - mSheet.StartCol + 1];
            int row = i;
            int col = j;

            if (cell.MergeCells)
            {
                Range mergeArea = cell.MergeArea;
                row = mergeArea.Row;
                col = mergeArea.Column;
                int mergedRows = mergeArea.Rows.Count;
                int mergedCols = mergeArea.Columns.Count;

                for (int ii = 0; ii < mergedRows; ++ii)
                {
                    for (int jj = 0; jj < mergedCols; ++jj)
                    {
                        // A merged area may reach beyond the table; only cells inside it are tracked.
                        int visRow = row + ii - mSheet.StartRow;
                        int visCol = col + jj - mSheet.StartCol;
                        if (visRow >= 0 && visRow < rowNum && visCol >= 0 && visCol < colNum)
                        {
                            vis[visRow, visCol] = true;
                        }
                    }
                }
            }

            // The current cell is always inside the table, so this index is always valid.
            vis[i - mSheet.StartRow, j - mSheet.StartCol] = true;
            mSheet.Nodes.Add(new Tuple<int, int>(row, col), new HeaderNode(row, col));
        }
    }
}
/// <summary>
/// Computes the boolean feature vector of one row.
/// </summary>
/// <remarks>
/// The vector starts with the layout features and continues with the textual features. The
/// position of each feature is fixed by the order below and must stay stable.
/// </remarks>
/// <param name="crow">Row number of the row being described.</param>
/// <param name="rowCellDict">Cells of the row keyed by column.</param>
/// <param name="mSheet">Table the row belongs to.</param>
/// <param name="blankFlag">Value of the first feature, supplied by the caller.</param>
/// <returns>The feature values in their fixed order.</returns>
private List<bool> generateFeatureByRowCRF(int crow, Dictionary<int, MCell> rowCellDict, MSheet mSheet, bool blankFlag)
{
    // Text of the whole row: every cell value followed by a single space.
    string rowText = "";
    foreach (MCell cellInRow in rowCellDict.Values)
    {
        rowText += cellInRow.Value + " ";
    }

    List<bool> features = new List<bool>
    {
        // layout features
        blankFlag,
        this.featureHasMergedCell(crow, mSheet),
        this.featureReachRightBound(rowCellDict, mSheet.MaxColNum),
        this.featureReachLeftBound(rowCellDict),
        this.featureIsOneColumn(rowCellDict),
        this.featureHasCenterAlignCell(rowCellDict),
        this.featureHasLeftAlignCell(rowCellDict),
        this.featureHasBoldFontCell(rowCellDict),
        this.featureIndentation(rowText),

        // textual features
        this.featureStartWithTable(rowText),
        this.featureStartWithPunctation(rowText),
        this.featureNumberPercentHigh(rowCellDict),
        this.featureDigitalPercentHigh(rowCellDict),
        this.featureAlphabetaAllCapital(rowText),
        this.featureAlphabetaStartWithCapital(rowCellDict),
        this.featureAlphabetaStartWithLowercase(rowCellDict),
        this.featureAlphabetaCellnumPercentHigh(rowCellDict),
        this.featureAlphabetaPercentHigh(rowText),
        this.featureContainSpecialChar(rowText),
        this.featureContainColon(rowText),
        // featureYearRangeCellnumHigh is intentionally not part of the vector.
        this.featureYearRangePercentHigh(rowCellDict),
        this.featureWordLengthHigh(rowCellDict)
    };

    return features;
}
/// <summary>
/// Builds the header hierarchy of a table: creates the header nodes, links every node to the
/// header node located directly above it, and finally attaches all remaining orphan nodes to
/// the root node.
/// </summary>
/// <param name="mSheet">Table whose <c>Nodes</c> and <c>RootNode</c> are populated.</param>
public void ExtractFromMSheet(MSheet mSheet)
{
    int rowNum = mSheet.RowNum;
    int colNum = mSheet.ColNum;
    this.initNodes(mSheet, rowNum, colNum);

    // Elements of a new bool array are already false, no explicit reset is needed.
    bool[,] vis = new bool[rowNum, colNum];

    int upRow = mSheet.StartRow;
    int leftCol = mSheet.StartCol;
    int downRow = mSheet.StartRow + mSheet.RowNum - 1;
    int rightCol = mSheet.StartCol + mSheet.ColNum - 1;

    for (int j = leftCol; j <= rightCol; ++j)
    {
        for (int i = upRow; i <= downRow; ++i)
        {
            if (vis[i - upRow, j - leftCol])
            {
                continue;
            }

            // mSheet.Cells is the table's own range and the Range indexer is 1-based
            // relative to that range, so sheet coordinates must be converted first.
            // Passing them unchanged read the wrong cell for tables not starting at A1.
            Range cell = mSheet.Cells.Cells[i - upRow + 1, j - leftCol + 1];
            int row = i;
            int col = j;

            if (cell.MergeCells)
            {
                Range mergeArea = cell.MergeArea;
                row = mergeArea.Row;
                col = mergeArea.Column;
                int mergedRows = mergeArea.Rows.Count;
                int mergedCols = mergeArea.Columns.Count;

                for (int ii = 0; ii < mergedRows; ++ii)
                {
                    for (int jj = 0; jj < mergedCols; ++jj)
                    {
                        // A merged area may reach beyond the table; only cells inside it are tracked.
                        int visRow = row + ii - upRow;
                        int visCol = col + jj - leftCol;
                        if (visRow >= 0 && visRow < rowNum && visCol >= 0 && visCol < colNum)
                        {
                            vis[visRow, visCol] = true;
                        }
                    }
                }
            }

            vis[i - upRow, j - leftCol] = true;

            Tuple<int, int> tmp = new Tuple<int, int>(row, col);
            if (!mSheet.Nodes.Keys.Contains(tmp))
            {
                continue;
            }

            int preRow = row - 1;
            int preCol = col;

            // A node on the first table row (or anchored left of the table) has no cell above
            // it inside the table; looking it up would address row 0 or a foreign cell.
            if (preRow < upRow || preCol < leftCol)
            {
                continue;
            }

            Range preCell = mSheet.Cells.Cells[preRow - upRow + 1, preCol - leftCol + 1];
            if (preCell.MergeCells)
            {
                preRow = preCell.MergeArea.Row;
                preCol = preCell.MergeArea.Column;
            }

            Tuple<int, int> preTmp = new Tuple<int, int>(preRow, preCol);
            if (mSheet.Nodes.Keys.Contains(preTmp))
            {
                mSheet.Nodes[preTmp].AddChild(mSheet.Nodes[tmp]);
                mSheet.Nodes[tmp].Parent = mSheet.Nodes[preTmp];
            }
        }
    }

    this.linkToRoot(mSheet);
}
/// <summary>
/// Lazily converts every table of every worksheet in the workbook into an <c>MSheet</c>.
/// </summary>
/// <remarks>
/// Each worksheet is split into table ranges by <c>TableHelper.SplitTable</c>. For every range
/// one <c>MSheet</c> is filled with the merged areas, the non-empty cells (value and formatting)
/// and the per-column data types, and is then yielded together with the worksheet name.
/// </remarks>
/// <returns>Tuples of (worksheet name, table).</returns>
public IEnumerable <Tuple <string, MSheet> > FetchSheetDict()
{
    Sheets sheets = this.workbook.Sheets;
    foreach (Worksheet sheet in sheets)
    {
        string sheetName = sheet.Name;
        // Each tuple is read below as (upRow, leftCol, downRow, rightCol), both ends inclusive.
        List <Tuple <int, int, int, int> > ranges = TableHelper.SplitTable(sheet);
        foreach (Tuple <int, int, int, int> range in ranges)
        {
            MSheet mSheet = new MSheet();
            int upRow = range.Item1;
            int leftCol = range.Item2;
            int downRow = range.Item3;
            int rightCol = range.Item4;
            int rowNum = downRow - upRow + 1;
            int colNum = rightCol - leftCol + 1;
            mSheet.StartRow = upRow;
            mSheet.StartCol = leftCol;
            mSheet.RowNum = rowNum;
            mSheet.ColNum = colNum;
            {
                // Keep a Range covering exactly this table; it is built from the addresses
                // of the two corner cells with the "$" markers removed.
                Range beginCell = sheet.Cells[upRow, leftCol];
                Range endCell = sheet.Cells[downRow, rightCol];
                string beginAddress = beginCell.get_Address().Replace("$", "");
                string endAddress = endCell.get_Address().Replace("$", "");
                mSheet.Cells = sheet.get_Range(beginAddress, endAddress);
            }
            // vis: cells already covered by a merged area; typeTable: data type per cell.
            // Both are indexed relative to the table's top-left corner.
            bool[,] vis = new bool[rowNum, colNum];
            DataType[,] typeTable = new DataType[rowNum, colNum];
            for (int i = 0; i < rowNum; ++i)
            {
                for (int j = 0; j < colNum; ++j)
                {
                    vis[i, j] = false;
                    typeTable[i, j] = DataType.NONE;
                }
            }
            for (int row = upRow; row <= downRow; ++row)
            {
                for (int col = leftCol; col <= rightCol; ++col)
                {
                    if (vis[row - upRow, col - leftCol])
                    {
                        continue;
                    }
                    Range cell = sheet.Cells[row, col];
                    if (cell.MergeCells)
                    {
                        // NOTE(review): the merged area is recorded as starting at (row, col) and
                        // every covered cell is marked in vis; this assumes the current cell is the
                        // area's top-left corner and that the area lies inside the table - confirm.
                        int rowCount = cell.MergeArea.Rows.Count;
                        int colCount = cell.MergeArea.Columns.Count;
                        mSheet.AddMergeCell(row, row + rowCount - 1, col, col + colCount - 1);
                        for (int i = row; i < row + rowCount; ++i)
                        {
                            for (int j = col; j < col + colCount; ++j)
                            {
                                vis[i - upRow, j - leftCol] = true;
                            }
                        }
                    }
                    string cellValue = Convert.ToString(cell.Value2);
                    // NOTE(review): cellType is never read afterwards.
                    string cellType = (cell.NumberFormat as string);
                    // Empty cells are skipped only after the merge bookkeeping above,
                    // so an empty merged area is still registered.
                    if (cellValue == null || cellValue.Length == 0)
                    {
                        continue;
                    }
                    string cType = this.getValueType(cellValue);
                    int indents = cell.IndentLevel;
                    /* XlHAlign
                     * -4131 = xlHAlignLeft -> ALIGN_LEFT = 0x1
                     * -4152 = xlHAlignRight -> ALIGN_RIGHT = 0x3
                     * -4108 = xlHAlignCenter -> ALIGN_CENTER = 0x2
                     * -4130 = xlHAlignJustify -> ALIGN_JUSTIFY = 0x5
                     * -4117 = xlHAlignDistributed ->
                     * 1 = xlHAlignGeneral -> ALIGN_GENERAL = 0x0
                     * 5 = xlHAlignFill -> ALIGN_FILL = 0x4
                     * 7 = xlHAlignCenterAcrossSelection ->
                     */
                    // The raw alignment value is stored as-is; it is not mapped through
                    // getFeatureAlignStyle.
                    int alignStyle = cell.HorizontalAlignment;
                    /* XlLineStyle
                     * -4142 = xlLineStyleNone -> BORDER_NONE = 0x0
                     * -4119 = xlDouble -> BORDER_DOUBLE = 0x6
                     * -4118 = xlDot -> BORDER_HAIR = 0x7
                     * -4115 = xlDash -> BORDER_DASHED = 0x3
                     * 1 = xlContinuous
                     * 4 = xlDashDot -> BORDER_DASH_DOT = 0x9
                     * 5 = xlDashDotDot -> BORDER_DASH_DOT_DOT = 0xB
                     * 13 = xlSlantDashDot -> BORDER_SLANTED_DASH_DOT = 0xD
                     */
                    string borderStyle = this.getFeatureBorderStyle(cell.Borders);
                    /* XlColorIndex
                     * -4142 = xlColorIndexNone
                     * -4105 = xlColorIndexAutomatic
                     */
                    double bgColor = cell.Interior.ColorIndex;
                    int boldFlag = this.getFeatureFontBold(cell.Font);
                    // NOTE(review): the factor 20.0 presumably converts the font size to
                    // twentieths of a point - confirm against getFeatureFontHeight.
                    double height = this.getFeatureFontHeight(cell.Font) * 20.0;
                    int italicFlag = this.getFeatureFontItalic(cell.Font);
                    // XlUnderlineStyle
                    int underlineFlag = this.getFeatureFontUnderline(cell.Font);
                    DataType dataType = this.getDataType(cell.NumberFormat as string, cellValue);
                    mSheet.InsertCell(row, col, cType, indents, alignStyle, borderStyle, (int)bgColor, boldFlag, (int)height, italicFlag, underlineFlag, cellValue);
                    typeTable[row - upRow, col - leftCol] = dataType;
                }
            }
            // Derive one data type per column from the per-cell types and attach it to the table.
            DataType[] columnTypes = new DataType[colNum];
            this.findColumnType(typeTable, rowNum, colNum, columnTypes);
            mSheet.SetColumnTypeTable(columnTypes, colNum);
            yield return(new Tuple <string, MSheet>(sheetName, mSheet));
        }
    }
}
/// <summary>
/// This method is out of date and should not be used!
/// Loads every worksheet's used range into one <c>MSheet</c>, inserting cells and merged areas
/// with row and column swapped.
/// </summary>
/// <returns>Map from worksheet name to the loaded table.</returns>
private Dictionary <string, MSheet> LoadSheetDictByTransposition()
{
    Dictionary <string, MSheet> sheetDict = new Dictionary <string, MSheet>();
    Sheets sheets = this.workbook.Sheets;
    foreach (Worksheet sheet in sheets)
    {
        // Unlike FetchSheetDict, the sheet is not split with TableHelper.SplitTable;
        // the whole used range is treated as a single table.
        string sheetName = sheet.Name;
        MSheet mSheet = new MSheet();
        int rowNum = sheet.UsedRange.Cells.Rows.Count;
        int colNum = sheet.UsedRange.Cells.Columns.Count;
        int stRow = sheet.UsedRange.Row;
        int stCol = sheet.UsedRange.Column;
        // NOTE(review): start position and size are stored untransposed although the cells
        // below are inserted transposed - confirm whether this is intended.
        mSheet.StartRow = stRow;
        mSheet.StartCol = stCol;
        mSheet.RowNum = rowNum;
        mSheet.ColNum = colNum;
        // vis marks cells already covered by a merged area, relative to the used range's corner.
        bool[,] vis = new bool[rowNum, colNum];
        for (int i = 0; i < rowNum; ++i)
        {
            for (int j = 0; j < colNum; ++j)
            {
                vis[i, j] = false;
            }
        }
        for (int row = stRow; row < stRow + rowNum; ++row)
        {
            for (int col = stCol; col < stCol + colNum; ++col)
            {
                Range cell = sheet.Cells[row, col];
                if (vis[row - stRow, col - stCol])
                {
                    continue;
                }
                if (cell.MergeCells)
                {
                    int rowCount = cell.MergeArea.Rows.Count;
                    int colCount = cell.MergeArea.Columns.Count;
                    // Transposed: the column span is passed where FetchSheetDict passes the row span.
                    mSheet.AddMergeCell(col, col + colCount - 1, row, row + rowCount - 1);
                    for (int i = row; i < row + rowCount; ++i)
                    {
                        for (int j = col; j < col + colCount; ++j)
                        {
                            vis[i - stRow, j - stCol] = true;
                        }
                    }
                }
                string cellValue = Convert.ToString(cell.Value2);
                // NOTE(review): cellType is never read afterwards.
                string cellType = (cell.NumberFormat as string);
                if (cellValue == null || cellValue.Length == 0)
                {
                    continue;
                }
                /*
                 * 0.00 means precision is 2
                 * #,## means to use , delimiter
                 */
                string cType = this.getValueType(cellValue);
                string cStr = cellValue;
                int indents = cell.IndentLevel;
                /* XlHAlign
                 * -4131 = xlHAlignLeft -> ALIGN_LEFT = 0x1
                 * -4152 = xlHAlignRight -> ALIGN_RIGHT = 0x3
                 * -4108 = xlHAlignCenter -> ALIGN_CENTER = 0x2
                 * -4130 = xlHAlignJustify -> ALIGN_JUSTIFY = 0x5
                 * -4117 = xlHAlignDistributed ->
                 * 1 = xlHAlignGeneral -> ALIGN_GENERAL = 0x0
                 * 5 = xlHAlignFill -> ALIGN_FILL = 0x4
                 * 7 = xlHAlignCenterAcrossSelection ->
                 */
                // The raw alignment value is stored as-is; it is not mapped through
                // getFeatureAlignStyle.
                int alignStyle = cell.HorizontalAlignment;
                /* XlLineStyle
                 * -4142 = xlLineStyleNone -> BORDER_NONE = 0x0
                 * -4119 = xlDouble -> BORDER_DOUBLE = 0x6
                 * -4118 = xlDot -> BORDER_HAIR = 0x7
                 * -4115 = xlDash -> BORDER_DASHED = 0x3
                 * 1 = xlContinuous
                 * 4 = xlDashDot -> BORDER_DASH_DOT = 0x9
                 * 5 = xlDashDotDot -> BORDER_DASH_DOT_DOT = 0xB
                 * 13 = xlSlantDashDot -> BORDER_SLANTED_DASH_DOT = 0xD
                 */
                string borderStyle = this.getFeatureBorderStyle(cell.Borders);
                /* XlColorIndex
                 * -4142 = xlColorIndexNone
                 * -4105 = xlColorIndexAutomatic
                 */
                double bgColor = cell.Interior.ColorIndex;
                int boldFlag = this.getFeatureFontBold(cell.Font);
                // NOTE(review): the factor 20.0 presumably converts the font size to
                // twentieths of a point - confirm against getFeatureFontHeight.
                double height = this.getFeatureFontHeight(cell.Font) * 20.0;
                int italicFlag = this.getFeatureFontItalic(cell.Font);
                // XlUnderlineStyle
                int underlineFlag = this.getFeatureFontUnderline(cell.Font);
                // Transposed: the cell is inserted at (col, row).
                mSheet.InsertCell(col, row, cType, indents, alignStyle, borderStyle, (int)bgColor, boldFlag, (int)height, italicFlag, underlineFlag, cStr);
            }
        }
        sheetDict.Add(sheetName, mSheet);
    }
    return(sheetDict);
}
/// <summary>
/// Layout feature: tells whether the row is listed in the table's <c>MergeRowSet</c>.
/// </summary>
/// <param name="crow">Row number to look up.</param>
/// <param name="mSheet">Table providing <c>MergeRowSet</c>.</param>
/// <returns><c>true</c> when <c>MergeRowSet</c> contains <paramref name="crow"/>.</returns>
private bool featureHasMergedCell(int crow, MSheet mSheet)
{
    bool rowIsInMergeSet = mSheet.MergeRowSet.Contains(crow);
    return rowIsInMergeSet;
}