Exemplo n.º 1
0
        /// <summary>
        /// Generates the CRF feature vector of every non-empty row of <paramref name="mSheet"/>
        /// and writes them to the file at <c>Config.CRFTMPFEATURE</c>, one row per line, in
        /// ascending row order. Any existing file at that path is overwritten.
        /// </summary>
        /// <remarks>
        /// Each line has the form
        /// <c>{fileName}____{sheetName with " " replaced by "__"}____{row - 1} f1 f2 ... Title</c>,
        /// where every feature is written as <c>1</c> or <c>0</c>.
        /// </remarks>
        /// <param name="fileName">Workbook file name used as the first segment of the row key.</param>
        /// <param name="sheetName">Worksheet name; spaces are replaced by <c>__</c> in the row key.</param>
        /// <param name="mSheet">Table whose rows are converted to feature vectors.</param>
        public void ProcessTable(string fileName, string sheetName, MSheet mSheet)
        {
            Dictionary <int, List <bool> > feaDict = this.feaRow.GenerateSingularFeatureCRF(mSheet);
            List <int> keySetFromFeaDict = new List <int>(feaDict.Keys);

            // Dictionary enumeration order is unspecified, so sort to emit rows top to bottom.
            keySetFromFeaDict.Sort();

            // The using block guarantees the writer is flushed and closed even when a write throws.
            using (StreamWriter fout = new StreamWriter(Config.CRFTMPFEATURE))
            {
                foreach (int row in keySetFromFeaDict)
                {
                    // The key stores the row index shifted down by one.
                    int a = row - 1;

                    // NOTE(review): spaces in sheetName are escaped but spaces in fileName are not;
                    // a file name containing whitespace would split the key column — confirm
                    // whether such names can occur.
                    fout.Write(fileName + "____" + sheetName.Replace(" ", "__") + "____" + a + " ");
                    foreach (bool feature in feaDict[row])
                    {
                        fout.Write(feature ? "1 " : "0 ");
                    }
                    // Every row is written with the constant last column "Title".
                    fout.WriteLine("Title");
                }
            }
        }
Exemplo n.º 2
0
        /// <summary>
        /// Reads the prediction file at <c>Config.CRFTMPPREDICT</c>, writes a
        /// <c>row&lt;TAB&gt;label</c> line per prediction to
        /// <c>{Config.OUTPUTDIR}/{workbookName}____{sheetName}____{counter}</c>, and records each
        /// label in <c>mSheet.Labels</c>.
        /// </summary>
        /// <remarks>
        /// Each input line is split on whitespace. The first token is the row key, whose last
        /// <c>____</c>-separated segment is parsed as an integer; the last token is the label.
        /// The row number written and stored is the parsed value plus one. Lines that are blank
        /// after trimming are skipped.
        /// </remarks>
        /// <param name="workbookName">Workbook name used in the output file name.</param>
        /// <param name="sheetName">Worksheet name used in the output file name.</param>
        /// <param name="mSheet">Table whose <c>Labels</c> collection receives the parsed labels.</param>
        /// <param name="counter">Suffix of the output file name; defaults to 0.</param>
        /// <exception cref="FormatException">
        /// The last segment of a row key is not a valid integer.
        /// </exception>
        public static void Run(string workbookName, string sheetName, MSheet mSheet, int counter = 0)
        {
            string predictPath = Config.CRFTMPPREDICT;

            Console.WriteLine("Generating final output");
            String outPath = Config.OUTPUTDIR + "/" + workbookName + "____" + sheetName + "____" + counter;

            // Both streams are disposed even if parsing or label insertion throws part-way through.
            using (StreamReader fin = new StreamReader(predictPath))
            using (StreamWriter fout = new StreamWriter(outPath))
            {
                String line;
                while ((line = fin.ReadLine()) != null)
                {
                    // Regex.Split always yields at least one element, so a blank line shows up
                    // as a single empty token rather than an empty array.
                    String[] strArr = Regex.Split(line.Trim(), @"\s+");
                    String   cKey   = strArr[0];
                    if (cKey.Length == 0)
                    {
                        continue;
                    }
                    String   label   = strArr[strArr.Length - 1];
                    String[] strArr2 = Regex.Split(cKey.Trim(), @"____");
                    int      row     = int.Parse(strArr2[strArr2.Length - 1]);
                    fout.WriteLine((row + 1).ToString() + "\t" + label);
                    // NOTE(review): if Labels is a dictionary, a repeated row key would make
                    // Add throw — confirm keys are unique per prediction file.
                    mSheet.Labels.Add(row + 1, DataTypes.String2RowLabel(label));
                }
            }
            Console.WriteLine("Successfully obtain prediction results");
        }
Exemplo n.º 3
0
        /// <summary>
        /// Builds one CRF feature vector for every row of <paramref name="mSheet"/> that has at
        /// least one entry in <c>mSheet.SheetDict</c>.
        /// </summary>
        /// <param name="mSheet">Table to scan, from StartRow/StartCol over RowNum x ColNum cells.</param>
        /// <returns>
        /// Map from row index to that row's feature vector; rows without any stored cell are absent.
        /// </returns>
        public Dictionary <int, List <bool> > GenerateSingularFeatureCRF(MSheet mSheet)
        {
            Dictionary <int, List <bool> > featuresByRow = new Dictionary <int, List <bool> >();

            for (int row = mSheet.StartRow; row < mSheet.StartRow + mSheet.RowNum; ++row)
            {
                // Gather the cells stored for this row, keyed by column index.
                Dictionary <int, MCell> cellsInRow = new Dictionary <int, MCell>();
                for (int col = mSheet.StartCol; col < mSheet.StartCol + mSheet.ColNum; ++col)
                {
                    Tuple <int, int> position = new Tuple <int, int>(row, col);
                    if (!mSheet.SheetDict.ContainsKey(position))
                    {
                        continue;
                    }
                    cellsInRow.Add(col, mSheet.SheetDict[position]);
                }

                if (cellsInRow.Count != 0)
                {
                    // The flag is true when the row directly above produced no feature vector.
                    bool previousRowMissing = !featuresByRow.ContainsKey(row - 1);
                    featuresByRow.Add(row, this.generateFeatureByRowCRF(row, cellsInRow, mSheet, previousRowMissing));
                }
            }
            return featuresByRow;
        }
Exemplo n.º 4
0
        /// <summary>
        /// Lazily walks every <c>xlsx</c> file in <c>Config.SHEETDIR</c> (skipping files whose
        /// name starts with <c>~$</c>) and yields each table found in it.
        /// </summary>
        /// <remarks>
        /// Before a table is yielded, <c>ProcessTable</c> is called for it, which writes its
        /// feature file. The workbook is closed once all of its tables have been consumed, and
        /// also when the caller stops enumerating early or an exception escapes.
        /// </remarks>
        /// <returns>
        /// A sequence of (file name, sheet name, table) tuples, produced on demand.
        /// </returns>
        public IEnumerable <Tuple <string, string, MSheet> > ScanEachExcel()
        {
            int counter = 0;

            string[] files = Directory.GetFiles(Config.SHEETDIR);
            for (int i = 0; i < files.Length; ++i)
            {
                string fileName = Path.GetFileName(files[i]);
                // Ordinal comparison: these are file-system names, not linguistic text.
                if (!fileName.EndsWith("xlsx", StringComparison.Ordinal) ||
                    fileName.StartsWith("~$", StringComparison.Ordinal))
                {
                    continue;
                }

                Console.WriteLine("Processing " + fileName);
                SheetLoader sheetLoader = new SheetLoader(Config.SHEETDIR + "/" + fileName);
                try
                {
                    foreach (Tuple <string, MSheet> sheetDict in sheetLoader.FetchSheetDict())
                    {
                        string sheetName = sheetDict.Item1;
                        MSheet mSheet    = sheetDict.Item2;
                        this.ProcessTable(fileName, sheetName, mSheet);
                        yield return(new Tuple <string, string, MSheet>(fileName, sheetName, mSheet));
                    }
                }
                finally
                {
                    // Runs on normal completion, on an exception, and when the consumer disposes
                    // the enumerator early, so the workbook is never left open.
                    sheetLoader.CloseWorkbook();
                }

                ++counter;
                if (counter % 100 == 0)
                {
                    Console.WriteLine("Current: " + counter);
                }
            }
        }
Exemplo n.º 5
0
 /// <summary>
 /// Attaches every node of <paramref name="mSheet"/> that has no parent to
 /// <c>mSheet.RootNode</c>, in both directions (child list and parent reference).
 /// </summary>
 /// <param name="mSheet">Table whose <c>Nodes</c> are linked under its root node.</param>
 private void linkToRoot(MSheet mSheet)
 {
     foreach (Tuple <int, int> position in mSheet.Nodes.Keys)
     {
         HeaderNode orphanCandidate = mSheet.Nodes[position];
         if (orphanCandidate.HasParent())
         {
             continue;
         }
         mSheet.RootNode.AddChild(orphanCandidate);
         orphanCandidate.Parent = mSheet.RootNode;
     }
     // Debug aid: this.printStructure(mSheet.RootNode, "");
 }
Exemplo n.º 6
0
        /// <summary>
        /// Shows how to use the frame finder: trains the CRF runner, then for every table
        /// produced by <c>PredictSheetRow.ScanEachExcel</c> runs prediction, transforms the
        /// prediction output into row labels, and extracts the header hierarchy.
        /// </summary>
        public static void ProcessEachTable()
        {
            CRFRunner runner = new CRFRunner();
            runner.Train();

            PredictSheetRow rowPredictor = new PredictSheetRow();
            foreach (Tuple <string, string, MSheet> table in rowPredictor.ScanEachExcel())
            {
                MSheet sheet = table.Item3;

                runner.Predict(table.Item1, table.Item2);
                TransformOutput.Run(table.Item1, table.Item2, sheet);
                // Debug aid: PrintLabel(sheet.Labels);

                // A fresh extractor is created for every table.
                HorizontalHierarchyExtractor extractor = new HorizontalHierarchyExtractor();
                extractor.ExtractFromMSheet(sheet);
            }
        }
Exemplo n.º 7
0
 /// <summary>
 /// Resets the node collection of <paramref name="mSheet"/> and adds one
 /// <c>HeaderNode</c> for each cell that lies in a row labelled <c>RowLabel.Header</c>.
 /// A merged area contributes a single node keyed by the area's first row and column.
 /// </summary>
 /// <param name="mSheet">Table to scan, column by column and top to bottom within a column.</param>
 /// <param name="rowNum">Row count of the visited-cell grid.</param>
 /// <param name="colNum">Column count of the visited-cell grid.</param>
 private void initNodes(MSheet mSheet, int rowNum, int colNum)
 {
     mSheet.InitNodes();

     // A freshly allocated bool array is already all false.
     bool[,] visited = new bool[rowNum, colNum];

     for (int col = mSheet.StartCol; col < mSheet.StartCol + mSheet.ColNum; ++col)
     {
         for (int row = mSheet.StartRow; row < mSheet.StartRow + mSheet.RowNum; ++row)
         {
             if (visited[row - mSheet.StartRow, col - mSheet.StartCol])
             {
                 continue;
             }
             if (!(mSheet.Labels.Keys.Contains(row) && mSheet.Labels[row] == RowLabel.Header))
             {
                 continue;
             }

             Range cell      = mSheet.Cells.Cells[row, col];
             int   anchorRow = row;
             int   anchorCol = col;
             if (cell.MergeCells)
             {
                 // Use the merged area's first cell as the node position and mark the whole
                 // area as visited so it yields exactly one node.
                 anchorRow = cell.MergeArea.Row;
                 anchorCol = cell.MergeArea.Column;
                 for (int dr = 0; dr < cell.MergeArea.Rows.Count; ++dr)
                 {
                     for (int dc = 0; dc < cell.MergeArea.Columns.Count; ++dc)
                     {
                         visited[anchorRow + dr - mSheet.StartRow, anchorCol + dc - mSheet.StartCol] = true;
                     }
                 }
             }
             visited[anchorRow - mSheet.StartRow, anchorCol - mSheet.StartCol] = true;
             mSheet.Nodes.Add(new Tuple <int, int>(anchorRow, anchorCol), new HeaderNode(anchorRow, anchorCol));
         }
     }
 }
Exemplo n.º 8
0
        /// <summary>
        /// Assembles the boolean CRF feature vector of one row: nine layout features followed
        /// by fourteen textual features, always in the same order.
        /// </summary>
        /// <param name="crow">Index of the row being described.</param>
        /// <param name="rowCellDict">Cells of the row, keyed by column index.</param>
        /// <param name="mSheet">Table the row belongs to.</param>
        /// <param name="blankFlag">First feature of the vector, supplied by the caller.</param>
        /// <returns>The 23-element feature vector of the row.</returns>
        private List <bool> generateFeatureByRowCRF(int crow, Dictionary <int, MCell> rowCellDict,
                                                    MSheet mSheet, bool blankFlag)
        {
            // Concatenate the cell values of the row, each followed by a single space.
            String rowText = "";
            foreach (MCell rowCell in rowCellDict.Values)
            {
                rowText += rowCell.Value + " ";
            }

            // The initializer adds the entries in the order written, which fixes the
            // position of every feature in the vector.
            List <bool> features = new List <bool>
            {
                // layout feature
                blankFlag,
                this.featureHasMergedCell(crow, mSheet),
                this.featureReachRightBound(rowCellDict, mSheet.MaxColNum),
                this.featureReachLeftBound(rowCellDict),
                this.featureIsOneColumn(rowCellDict),
                this.featureHasCenterAlignCell(rowCellDict),
                this.featureHasLeftAlignCell(rowCellDict),
                this.featureHasBoldFontCell(rowCellDict),
                this.featureIndentation(rowText),

                // textual feature
                this.featureStartWithTable(rowText),
                this.featureStartWithPunctation(rowText),
                this.featureNumberPercentHigh(rowCellDict),
                this.featureDigitalPercentHigh(rowCellDict),
                this.featureAlphabetaAllCapital(rowText),
                this.featureAlphabetaStartWithCapital(rowCellDict),
                this.featureAlphabetaStartWithLowercase(rowCellDict),
                this.featureAlphabetaCellnumPercentHigh(rowCellDict),
                this.featureAlphabetaPercentHigh(rowText),
                this.featureContainSpecialChar(rowText),
                this.featureContainColon(rowText),
                this.featureYearRangeCellnumHigh(rowCellDict),
                this.featureYearRangePercentHigh(rowCellDict),
                this.featureWordLengthHigh(rowCellDict)
            };
            return features;
        }
Exemplo n.º 9
0
        /// <summary>
        /// Builds the header hierarchy of <paramref name="mSheet"/>: creates the header nodes,
        /// links each node to the node found directly above it (if any), and finally attaches
        /// all remaining parentless nodes to the root node.
        /// </summary>
        /// <param name="mSheet">Table whose <c>Nodes</c> are created and linked.</param>
        public void ExtractFromMSheet(MSheet mSheet)
        {
            int rowNum = mSheet.RowNum;
            int colNum = mSheet.ColNum;

            this.initNodes(mSheet, rowNum, colNum);

            // A freshly allocated bool array is already all false.
            bool[,] visited = new bool[rowNum, colNum];

            int upRow    = mSheet.StartRow;
            int leftCol  = mSheet.StartCol;
            int downRow  = mSheet.StartRow + mSheet.RowNum - 1;
            int rightCol = mSheet.StartCol + mSheet.ColNum - 1;

            // Scan column by column, top to bottom within each column.
            for (int j = leftCol; j <= rightCol; ++j)
            {
                for (int i = upRow; i <= downRow; ++i)
                {
                    if (visited[i - upRow, j - leftCol])
                    {
                        continue;
                    }

                    Range cell      = mSheet.Cells.Cells[i, j];
                    int   anchorRow = i;
                    int   anchorCol = j;
                    if (cell.MergeCells)
                    {
                        // A merged area is identified by its first row and column and is
                        // marked visited as a whole.
                        anchorRow = cell.MergeArea.Row;
                        anchorCol = cell.MergeArea.Column;
                        for (int dr = 0; dr < cell.MergeArea.Rows.Count; ++dr)
                        {
                            for (int dc = 0; dc < cell.MergeArea.Columns.Count; ++dc)
                            {
                                visited[anchorRow + dr - mSheet.StartRow, anchorCol + dc - mSheet.StartCol] = true;
                            }
                        }
                    }
                    visited[i - upRow, j - leftCol] = true;

                    Tuple <int, int> nodeKey = new Tuple <int, int>(anchorRow, anchorCol);
                    if (!mSheet.Nodes.Keys.Contains(nodeKey))
                    {
                        continue;
                    }

                    // Look at the cell one row above the node; if it is merged, use the
                    // first cell of its merged area instead.
                    int   aboveRow  = anchorRow - 1;
                    int   aboveCol  = anchorCol;
                    Range aboveCell = mSheet.Cells.Cells[aboveRow, aboveCol];
                    if (aboveCell.MergeCells)
                    {
                        aboveRow = aboveCell.MergeArea.Row;
                        aboveCol = aboveCell.MergeArea.Column;
                    }

                    Tuple <int, int> parentKey = new Tuple <int, int>(aboveRow, aboveCol);
                    if (mSheet.Nodes.Keys.Contains(parentKey))
                    {
                        mSheet.Nodes[parentKey].AddChild(mSheet.Nodes[nodeKey]);
                        mSheet.Nodes[nodeKey].Parent = mSheet.Nodes[parentKey];
                    }
                }
            }
            this.linkToRoot(mSheet);
        }
Exemplo n.º 10
0
        /// <summary>
        /// Lazily yields one <c>MSheet</c> per table range that <c>TableHelper.SplitTable</c>
        /// reports for each worksheet of the loaded workbook, paired with the worksheet name.
        /// </summary>
        /// <remarks>
        /// For every range the method records its position and size, registers merged areas,
        /// inserts every non-empty cell together with its formatting attributes, and derives a
        /// per-column data type from the per-cell data types.
        /// </remarks>
        /// <returns>A sequence of (sheet name, table) tuples, produced on demand.</returns>
        public IEnumerable <Tuple <string, MSheet> > FetchSheetDict()
        {
            Sheets sheets = this.workbook.Sheets;

            foreach (Worksheet sheet in sheets)
            {
                string sheetName = sheet.Name;
                List <Tuple <int, int, int, int> > ranges = TableHelper.SplitTable(sheet);
                foreach (Tuple <int, int, int, int> range in ranges)
                {
                    // Range tuple layout as read here: (top row, left column, bottom row, right column).
                    MSheet mSheet   = new MSheet();
                    int    upRow    = range.Item1;
                    int    leftCol  = range.Item2;
                    int    downRow  = range.Item3;
                    int    rightCol = range.Item4;
                    int    rowNum   = downRow - upRow + 1;
                    int    colNum   = rightCol - leftCol + 1;
                    mSheet.StartRow = upRow;
                    mSheet.StartCol = leftCol;
                    mSheet.RowNum   = rowNum;
                    mSheet.ColNum   = colNum;
                    {
                        // Build the range from the corner cells' addresses with the "$" markers removed.
                        Range  beginCell    = sheet.Cells[upRow, leftCol];
                        Range  endCell      = sheet.Cells[downRow, rightCol];
                        string beginAddress = beginCell.get_Address().Replace("$", "");
                        string endAddress   = endCell.get_Address().Replace("$", "");
                        mSheet.Cells = sheet.get_Range(beginAddress, endAddress);
                    }
                    // vis marks cells already covered by a registered merged area;
                    // typeTable holds the data type of every inserted cell (NONE otherwise).
                    bool[,] vis           = new bool[rowNum, colNum];
                    DataType[,] typeTable = new DataType[rowNum, colNum];
                    for (int i = 0; i < rowNum; ++i)
                    {
                        for (int j = 0; j < colNum; ++j)
                        {
                            vis[i, j]       = false;
                            typeTable[i, j] = DataType.NONE;
                        }
                    }

                    for (int row = upRow; row <= downRow; ++row)
                    {
                        for (int col = leftCol; col <= rightCol; ++col)
                        {
                            if (vis[row - upRow, col - leftCol])
                            {
                                continue;
                            }
                            Range cell = sheet.Cells[row, col];
                            if (cell.MergeCells)
                            {
                                // NOTE(review): the area is registered and marked starting at
                                // (row, col), which presumes this cell is the area's first cell
                                // and that the area lies inside the table range — confirm.
                                int rowCount = cell.MergeArea.Rows.Count;
                                int colCount = cell.MergeArea.Columns.Count;
                                mSheet.AddMergeCell(row, row + rowCount - 1, col, col + colCount - 1);
                                for (int i = row; i < row + rowCount; ++i)
                                {
                                    for (int j = col; j < col + colCount; ++j)
                                    {
                                        vis[i - upRow, j - leftCol] = true;
                                    }
                                }
                            }
                            string cellValue = Convert.ToString(cell.Value2);
                            // cellType is assigned but not read anywhere in this method.
                            string cellType  = (cell.NumberFormat as string);
                            // Cells without a value are not inserted (merged areas were already registered above).
                            if (cellValue == null || cellValue.Length == 0)
                            {
                                continue;
                            }
                            string cType   = this.getValueType(cellValue);
                            int    indents = cell.IndentLevel;

                            /* XlHAlign
                             * -4131 = xlHAlignLeft                  -> ALIGN_LEFT = 0x1
                             * -4152 = xlHAlignRight                 -> ALIGN_RIGHT = 0x3
                             * -4108 = xlHAlignCenter                -> ALIGN_CENTER = 0x2
                             * -4130 = xlHAlignJustify               -> ALIGN_JUSTIFY = 0x5
                             * -4117 = xlHAlignDistributed           ->
                             * 1     = xlHAlignGeneral               -> ALIGN_GENERAL = 0x0
                             * 5     = xlHAlignFill                  -> ALIGN_FILL = 0x4
                             * 7     = xlHAlignCenterAcrossSelection ->
                             */
                            // int alignStyle = cell.HorizontalAlignment;
                            // int alignStyle = this.getFeatureAlignStyle(cell.HorizontalAlignment);
                            // The raw alignment value is stored; it is not mapped to the ALIGN_* constants.
                            int alignStyle = cell.HorizontalAlignment;

                            /* XlLineStyle
                             * -4142 = xlLineStyleNone -> BORDER_NONE = 0x0
                             * -4119 = xlDouble        -> BORDER_DOUBLE = 0x6
                             * -4118 = xlDot           -> BORDER_HAIR = 0x7
                             * -4115 = xlDash          -> BORDER_DASHED = 0x3
                             * 1     = xlContinuous
                             * 4     = xlDashDot       -> BORDER_DASH_DOT = 0x9
                             * 5     = xlDashDotDot    -> BORDER_DASH_DOT_DOT = 0xB
                             * 13    = xlSlantDashDot  -> BORDER_SLANTED_DASH_DOT = 0xD
                             */
                            string borderStyle = this.getFeatureBorderStyle(cell.Borders);

                            /* XlColorIndex
                             * -4142 = xlColorIndexNone
                             * -4105 = xlColorIndexAutomatic
                             */
                            double bgColor    = cell.Interior.ColorIndex;
                            int    boldFlag   = this.getFeatureFontBold(cell.Font);
                            // NOTE(review): the factor 20.0 presumably converts the font size to
                            // 1/20-point units — confirm against getFeatureFontHeight.
                            double height     = this.getFeatureFontHeight(cell.Font) * 20.0;
                            int    italicFlag = this.getFeatureFontItalic(cell.Font);
                            // XlUnderlineStyle
                            int      underlineFlag = this.getFeatureFontUnderline(cell.Font);
                            DataType dataType      = this.getDataType(cell.NumberFormat as string, cellValue);
                            mSheet.InsertCell(row, col, cType, indents, alignStyle, borderStyle,
                                              (int)bgColor, boldFlag, (int)height, italicFlag, underlineFlag, cellValue);
                            typeTable[row - upRow, col - leftCol] = dataType;
                        }
                    }
                    // Reduce the per-cell types to one type per column and store it on the table.
                    DataType[] columnTypes = new DataType[colNum];
                    this.findColumnType(typeTable, rowNum, colNum, columnTypes);
                    mSheet.SetColumnTypeTable(columnTypes, colNum);
                    yield return(new Tuple <string, MSheet>(sheetName, mSheet));
                }
            }
        }
Exemplo n.º 11
0
        // This method is out of date and should not be used!
        /// <summary>
        /// Loads every worksheet of the workbook into an <c>MSheet</c> built from the
        /// worksheet's used range, passing the column index before the row index when
        /// registering merged areas and inserting cells.
        /// </summary>
        /// <returns>Map from worksheet name to the table built for that worksheet.</returns>
        private Dictionary <string, MSheet> LoadSheetDictByTransposition()
        {
            Dictionary <string, MSheet> sheetDict = new Dictionary <string, MSheet>();
            Sheets sheets = this.workbook.Sheets;

            foreach (Worksheet sheet in sheets)
            {
                // List<Tuple<int, int, int, int>> ranges = TableHelper.SplitTable(sheet);
                string sheetName = sheet.Name;
                MSheet mSheet    = new MSheet();
                // The whole used range of the worksheet is treated as a single table.
                int    rowNum    = sheet.UsedRange.Cells.Rows.Count;
                int    colNum    = sheet.UsedRange.Cells.Columns.Count;
                int    stRow     = sheet.UsedRange.Row;
                int    stCol     = sheet.UsedRange.Column;
                mSheet.StartRow = stRow;
                mSheet.StartCol = stCol;
                mSheet.RowNum   = rowNum;
                mSheet.ColNum   = colNum;
                // vis marks cells already covered by a registered merged area.
                bool[,] vis     = new bool[rowNum, colNum];
                for (int i = 0; i < rowNum; ++i)
                {
                    for (int j = 0; j < colNum; ++j)
                    {
                        vis[i, j] = false;
                    }
                }

                for (int row = stRow; row < stRow + rowNum; ++row)
                {
                    for (int col = stCol; col < stCol + colNum; ++col)
                    {
                        Range cell = sheet.Cells[row, col];

                        if (vis[row - stRow, col - stCol])
                        {
                            continue;
                        }

                        if (cell.MergeCells)
                        {
                            int rowCount = cell.MergeArea.Rows.Count;
                            int colCount = cell.MergeArea.Columns.Count;
                            // Column bounds are passed first, then row bounds.
                            mSheet.AddMergeCell(col, col + colCount - 1, row, row + rowCount - 1);
                            for (int i = row; i < row + rowCount; ++i)
                            {
                                for (int j = col; j < col + colCount; ++j)
                                {
                                    vis[i - stRow, j - stCol] = true;
                                }
                            }
                        }
                        string cellValue = Convert.ToString(cell.Value2);
                        // cellType is assigned but not read anywhere in this method.
                        string cellType  = (cell.NumberFormat as string);
                        // Cells without a value are not inserted.
                        if (cellValue == null || cellValue.Length == 0)
                        {
                            continue;
                        }

                        /*
                         * 0.00 means precision is 2
                         * #,## means to use , delimiter
                         */
                        string cType   = this.getValueType(cellValue);
                        string cStr    = cellValue;
                        int    indents = cell.IndentLevel;

                        /* XlHAlign
                         * -4131 = xlHAlignLeft                  -> ALIGN_LEFT = 0x1
                         * -4152 = xlHAlignRight                 -> ALIGN_RIGHT = 0x3
                         * -4108 = xlHAlignCenter                -> ALIGN_CENTER = 0x2
                         * -4130 = xlHAlignJustify               -> ALIGN_JUSTIFY = 0x5
                         * -4117 = xlHAlignDistributed           ->
                         * 1     = xlHAlignGeneral               -> ALIGN_GENERAL = 0x0
                         * 5     = xlHAlignFill                  -> ALIGN_FILL = 0x4
                         * 7     = xlHAlignCenterAcrossSelection ->
                         */
                        // int alignStyle = cell.HorizontalAlignment;
                        // int alignStyle = this.getFeatureAlignStyle(cell.HorizontalAlignment);
                        // The raw alignment value is stored; it is not mapped to the ALIGN_* constants.
                        int alignStyle = cell.HorizontalAlignment;

                        /* XlLineStyle
                         * -4142 = xlLineStyleNone -> BORDER_NONE = 0x0
                         * -4119 = xlDouble        -> BORDER_DOUBLE = 0x6
                         * -4118 = xlDot           -> BORDER_HAIR = 0x7
                         * -4115 = xlDash          -> BORDER_DASHED = 0x3
                         * 1     = xlContinuous
                         * 4     = xlDashDot       -> BORDER_DASH_DOT = 0x9
                         * 5     = xlDashDotDot    -> BORDER_DASH_DOT_DOT = 0xB
                         * 13    = xlSlantDashDot  -> BORDER_SLANTED_DASH_DOT = 0xD
                         */
                        string borderStyle = this.getFeatureBorderStyle(cell.Borders);

                        /* XlColorIndex
                         * -4142 = xlColorIndexNone
                         * -4105 = xlColorIndexAutomatic
                         */
                        double bgColor    = cell.Interior.ColorIndex;
                        int    boldFlag   = this.getFeatureFontBold(cell.Font);
                        // NOTE(review): the factor 20.0 presumably converts the font size to
                        // 1/20-point units — confirm against getFeatureFontHeight.
                        double height     = this.getFeatureFontHeight(cell.Font) * 20.0;
                        int    italicFlag = this.getFeatureFontItalic(cell.Font);
                        // XlUnderlineStyle
                        int underlineFlag = this.getFeatureFontUnderline(cell.Font);
                        // The cell is inserted with the column index first, then the row index.
                        mSheet.InsertCell(col, row, cType, indents, alignStyle, borderStyle,
                                          (int)bgColor, boldFlag, (int)height, italicFlag, underlineFlag, cStr);
                    }
                }
                // Worksheet names are used as keys; Dictionary.Add throws on a duplicate key.
                sheetDict.Add(sheetName, mSheet);
            }
            return(sheetDict);
        }
Exemplo n.º 12
0
 /// <summary>
 /// Layout feature: reports whether <c>mSheet.MergeRowSet</c> contains the given row.
 /// </summary>
 /// <param name="crow">Row index to look up.</param>
 /// <param name="mSheet">Table whose <c>MergeRowSet</c> is consulted.</param>
 /// <returns><c>true</c> when the row index is present in the set.</returns>
 private bool featureHasMergedCell(int crow, MSheet mSheet)
 {
     bool rowIsInMergeSet = mSheet.MergeRowSet.Contains(crow);
     return rowIsInMergeSet;
 }