示例#1
0
        /// <summary>
        /// Tests whether at least 60% of the cells in a row are accepted by
        /// <c>isNumber</c> or <c>isNa</c>.
        /// </summary>
        /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
        /// <returns>
        /// <c>true</c> when accepted cells / total cells is at least 0.6;
        /// <c>false</c> otherwise, and always <c>false</c> for an empty row.
        /// </returns>
        private bool featureDigitalPercentHigh(Dictionary <int, MCell> rowCellDict)
        {
            int cellTotal = rowCellDict.Count();
            if (cellTotal == 0)
            {
                return false;
            }

            int accepted = 0;
            foreach (MCell cell in rowCellDict.Values)
            {
                // isNa is only consulted when isNumber rejects the value.
                if (this.isNumber(cell.Value) || this.isNa(cell.Value))
                {
                    accepted++;
                }
            }

            double ratio = (double)accepted / (double)cellTotal;
            return ratio >= 0.6;
        }
示例#2
0
        /// <summary>
        /// Tests whether the numbers extracted from a row are mostly year-like,
        /// i.e. fall in the inclusive range 1800..2300.
        /// </summary>
        /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
        /// <returns>
        /// <c>true</c> when (year-like numbers) / (1 + all extracted numbers) is at
        /// least 0.7; <c>false</c> otherwise, and always <c>false</c> for an empty row.
        /// </returns>
        private bool featureYearRangePercentHigh(Dictionary <int, MCell> rowCellDict)
        {
            if (rowCellDict.Count() == 0)
            {
                return false;
            }

            int yearLike = 0;
            // TODO total should be 1 or 0?
            // The denominator starts at 1, so the ratio is slightly below the true
            // fraction and never divides by zero.
            int denominator = 1;

            foreach (MCell cell in rowCellDict.Values)
            {
                List<double> numbers = this.getNumberSet(cell.Value);
                denominator += numbers.Count();
                yearLike += numbers.Count(n => n >= 1800 && n <= 2300);
            }

            double ratio = (double)yearLike / (double)denominator;
            return ratio >= 0.7;
        }
示例#3
0
        /// <summary>
        /// Builds one boolean feature vector per non-empty row of a sheet.
        /// </summary>
        /// <remarks>
        /// Rows are visited from <c>StartRow</c> for <c>RowNum</c> rows and columns from
        /// <c>StartCol</c> for <c>ColNum</c> columns. Rows with no cell present in
        /// <c>SheetDict</c> are skipped and get no entry in the result.
        /// </remarks>
        /// <param name="mSheet">Sheet whose cells are looked up by (row, column) tuple.</param>
        /// <returns>Map from row index to the feature vector produced by <c>generateFeatureByRowCRF</c>.</returns>
        public Dictionary <int, List <bool> > GenerateSingularFeatureCRF(MSheet mSheet)
        {
            var featuresByRow = new Dictionary<int, List<bool>>();

            for (int row = mSheet.StartRow; row < mSheet.StartRow + mSheet.RowNum; ++row)
            {
                // Collect the cells that actually exist on this row, keyed by column.
                var cellsInRow = new Dictionary<int, MCell>();
                for (int column = mSheet.StartCol; column < mSheet.StartCol + mSheet.ColNum; ++column)
                {
                    MCell cell;
                    if (mSheet.SheetDict.TryGetValue(Tuple.Create(row, column), out cell))
                    {
                        cellsInRow.Add(column, cell);
                    }
                }

                if (cellsInRow.Count() == 0)
                {
                    continue;
                }

                // The flag is set when the previous row index produced no features
                // (it was skipped as empty, or this is the first row processed).
                bool previousRowMissing = !featuresByRow.ContainsKey(row - 1);
                List<bool> rowFeatures = this.generateFeatureByRowCRF(row, cellsInRow, mSheet, previousRowMissing);
                featuresByRow.Add(row, rowFeatures);
            }

            return featuresByRow;
        }
示例#4
0
        /// <summary>
        /// Creates a cell from the given attributes, stores it in <c>SheetDict</c> under
        /// (<paramref name="rowIdx"/>, <paramref name="colIdx"/>), and updates the sheet's
        /// accumulated text and maximum row/column indices.
        /// </summary>
        /// <param name="rowIdx">Row index of the cell.</param>
        /// <param name="colIdx">Column index of the cell.</param>
        /// <param name="cType">Cell type; when equal to "str" the value is appended to the sheet text.</param>
        /// <param name="indents">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="alignStyle">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="borderStyle">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="bgColor">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="boldFlag">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="height">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="italicFlag">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="underlineFlag">Forwarded to <c>MCell.Init</c>.</param>
        /// <param name="value">Cell content.</param>
        public void InsertCell(int rowIdx, int colIdx, string cType, int indents,
                               int alignStyle, string borderStyle, int bgColor, int boldFlag,
                               int height, int italicFlag, int underlineFlag, string value)
        {
            var cell = new MCell();

            // Note: Init takes the attributes in a different order than this method
            // receives them (value and type first, boldFlag before borderStyle).
            cell.Init(value, cType, indents, alignStyle, boldFlag, borderStyle, bgColor,
                      height, italicFlag, underlineFlag);

            var position = new Tuple<int, int>(rowIdx, colIdx);
            this.SheetDict.Add(position, cell);

            if (cType.Equals("str"))
            {
                // String cells contribute to the sheet text, each followed by one space.
                this.txt = string.Concat(this.txt, value, " ");
            }

            this.maxRowNum = Math.Max(rowIdx, this.maxRowNum);
            this.maxColNum = Math.Max(colIdx, this.maxColNum);
        }
示例#5
0
 /// <summary>
 /// Tests whether the row consists of exactly one cell whose value is longer
 /// than 40 characters.
 /// </summary>
 /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
 /// <returns>
 /// <c>true</c> only when there is exactly one cell and its value length exceeds 40;
 /// <c>false</c> in every other case.
 /// </returns>
 private bool featureWordLengthHigh(Dictionary <int, MCell> rowCellDict)
 {
     bool singleCell = rowCellDict.Count() == 1;
     if (!singleCell)
     {
         return false;
     }

     // Exactly one entry exists here, so the loop body runs once.
     bool isLong = false;
     foreach (MCell onlyCell in rowCellDict.Values)
     {
         isLong = onlyCell.Value.Length > 40;
     }
     return isLong;
 }
示例#6
0
        /// <summary>
        /// Tests whether at least 60% of the cells in a row are "str"-typed cells whose
        /// value contains an ASCII letter.
        /// </summary>
        /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
        /// <returns>
        /// <c>true</c> when (matching cells) / (all cells) is at least 0.6. For an empty
        /// row the ratio is 0/0 = NaN, which compares as <c>false</c>.
        /// </returns>
        private bool featureAlphabetaCellnumPercentHigh(Dictionary <int, MCell> rowCellDict)
        {
            // Only cells typed "str" are tested against the letter pattern;
            // all cells, whatever their type, count toward the denominator.
            int lettered = rowCellDict.Values.Count(
                cell => cell.Type.Equals("str") && Regex.IsMatch(cell.Value, @"[A-Za-z]"));

            double ratio = (double)lettered / (double)rowCellDict.Count();
            return ratio >= 0.6;
        }
示例#7
0
 /// <summary>
 /// Tests that every non-empty "str" cell for which <c>hasLetter</c> returns true
 /// begins with an ASCII capital letter ('A'..'Z').
 /// </summary>
 /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
 /// <returns>
 /// <c>false</c> as soon as one such cell starts with any other character;
 /// <c>true</c> otherwise, including when no cell qualifies.
 /// </returns>
 private bool featureAlphabetaStartWithCapital(Dictionary <int, MCell> rowCellDict)
 {
     foreach (MCell cell in rowCellDict.Values)
     {
         // Non-string cells and empty strings are ignored.
         bool relevant = cell.Type.Equals("str") && cell.Value.Length != 0;
         if (!relevant)
         {
             continue;
         }

         string text = cell.Value;
         if (this.hasLetter(text) && (text[0] < 'A' || text[0] > 'Z'))
         {
             return false;
         }
     }
     return true;
 }
示例#8
0
        /// <summary>
        /// Produces the boolean feature vector for one row.
        /// </summary>
        /// <remarks>
        /// The vector holds 9 layout features followed by 14 textual features, always in
        /// the order listed below. Text-based features receive the row's cell values
        /// concatenated in dictionary enumeration order, each followed by one space.
        /// </remarks>
        /// <param name="crow">Row index, passed to <c>featureHasMergedCell</c>.</param>
        /// <param name="rowCellDict">Cells of the row; the keys are iterated as column indices.</param>
        /// <param name="mSheet">Sheet the row belongs to.</param>
        /// <param name="blankFlag">Stored as the first feature, unchanged.</param>
        /// <returns>The feature vector for the row.</returns>
        private List <bool> generateFeatureByRowCRF(int crow, Dictionary <int, MCell> rowCellDict,
                                                    MSheet mSheet, bool blankFlag)
        {
            var textBuilder = new System.Text.StringBuilder();
            foreach (MCell cell in rowCellDict.Values)
            {
                textBuilder.Append(cell.Value).Append(" ");
            }
            string rowText = textBuilder.ToString();

            var features = new List<bool>
            {
                // layout feature
                blankFlag,
                this.featureHasMergedCell(crow, mSheet),
                this.featureReachRightBound(rowCellDict, mSheet.MaxColNum),
                this.featureReachLeftBound(rowCellDict),
                this.featureIsOneColumn(rowCellDict),
                this.featureHasCenterAlignCell(rowCellDict),
                this.featureHasLeftAlignCell(rowCellDict),
                this.featureHasBoldFontCell(rowCellDict),
                this.featureIndentation(rowText),

                // textual feature
                this.featureStartWithTable(rowText),
                this.featureStartWithPunctation(rowText),
                this.featureNumberPercentHigh(rowCellDict),
                this.featureDigitalPercentHigh(rowCellDict),
                this.featureAlphabetaAllCapital(rowText),
                this.featureAlphabetaStartWithCapital(rowCellDict),
                this.featureAlphabetaStartWithLowercase(rowCellDict),
                this.featureAlphabetaCellnumPercentHigh(rowCellDict),
                this.featureAlphabetaPercentHigh(rowText),
                this.featureContainSpecialChar(rowText),
                this.featureContainColon(rowText),
                this.featureYearRangeCellnumHigh(rowCellDict),
                this.featureYearRangePercentHigh(rowCellDict),
                this.featureWordLengthHigh(rowCellDict),
            };
            return features;
        }
示例#9
0
        /// <summary>
        /// Tests whether a row contains at least three year-like numbers, i.e. numbers
        /// in the inclusive range 1800..2300 among those returned by <c>getNumberSet</c>.
        /// </summary>
        /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
        /// <returns>
        /// <c>true</c> when three or more extracted numbers are in range;
        /// <c>false</c> otherwise, and always <c>false</c> for an empty row.
        /// </returns>
        private bool featureYearRangeCellnumHigh(Dictionary <int, MCell> rowCellDict)
        {
            if (rowCellDict.Count() == 0)
            {
                return false;
            }

            int yearLike = 0;
            foreach (MCell cell in rowCellDict.Values)
            {
                // Every cell is scanned; there is no early exit once the threshold is met.
                List<double> numbers = this.getNumberSet(cell.Value);
                foreach (double candidate in numbers)
                {
                    bool inRange = candidate >= 1800 && candidate <= 2300;
                    if (inRange)
                    {
                        yearLike += 1;
                    }
                }
            }

            return yearLike >= 3;
        }
示例#10
0
        /// <summary>
        /// Tests whether the cell with the smallest key in the row (iterated as a column
        /// index) starts with an ASCII lowercase letter ('a'..'z').
        /// </summary>
        /// <remarks>
        /// Only the cell with the smallest key is inspected, and its type is not checked.
        /// </remarks>
        /// <param name="rowCellDict">Cells of a single row; the keys are iterated as column indices.</param>
        /// <returns>
        /// <c>true</c> when that cell's value is non-empty, <c>hasLetter</c> accepts
        /// it, and its first character is in 'a'..'z'; <c>false</c> otherwise,
        /// including for an empty row.
        /// </returns>
        private bool featureAlphabetaStartWithLowercase(Dictionary <int, MCell> rowCellDict)
        {
            // Guard: with no cells the minimum key would stay at int.MaxValue and the
            // lookup below would throw KeyNotFoundException. Sibling features return
            // false for an empty row, so do the same here.
            if (rowCellDict.Count == 0)
            {
                return(false);
            }

            int col = int.MaxValue;

            foreach (int key in rowCellDict.Keys)
            {
                col = Math.Min(col, key);
            }
            MCell mCell = rowCellDict[col];

            if (mCell.Value.Length == 0)
            {
                return(false);
            }
            char c = mCell.Value[0];

            return(this.hasLetter(mCell.Value) && (c >= 'a' && c <= 'z'));
        }