/// <summary>
/// Row feature: a large share of the cells hold a value accepted by
/// <c>isNumber</c> or <c>isNa</c>.
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>true</c> when the accepted cells make up at least 60% of the row;
/// <c>false</c> otherwise, including for an empty row.
/// </returns>
private bool featureDigitalPercentHigh(Dictionary<int, MCell> rowCellDict)
{
    int cellTotal = rowCellDict.Count;
    if (cellTotal == 0)
    {
        return false;
    }

    // isNa is only consulted when isNumber rejects the value (short-circuit).
    int accepted = rowCellDict.Values.Count(
        cell => this.isNumber(cell.Value) || this.isNa(cell.Value));

    return (double)accepted / (double)cellTotal >= 0.6;
}
/// <summary>
/// Row feature: most of the numbers found in the row's cell values fall in
/// the range 1800..2300 (inclusive).
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>true</c> when in-range numbers divided by (1 + all numbers returned by
/// <c>getNumberSet</c>) is at least 0.7; <c>false</c> otherwise, including
/// for an empty row.
/// </returns>
private bool featureYearRangePercentHigh(Dictionary<int, MCell> rowCellDict)
{
    if (rowCellDict.Count == 0)
    {
        return false;
    }

    // TODO total should be 1 or 0?
    // The denominator starts at 1, so a row with no numbers divides by 1 rather than 0.
    int total = 1;
    int inRange = 0;

    foreach (MCell cell in rowCellDict.Values)
    {
        List<double> numbers = this.getNumberSet(cell.Value);
        total += numbers.Count;
        inRange += numbers.Count(n => n >= 1800 && n <= 2300);
    }

    return (double)inRange / (double)total >= 0.7;
}
/// <summary>
/// Builds the boolean feature vector for every non-empty row of a sheet.
/// </summary>
/// <param name="mSheet">
/// Sheet to scan. Rows <c>StartRow .. StartRow + RowNum - 1</c> and columns
/// <c>StartCol .. StartCol + ColNum - 1</c> are looked up in <c>SheetDict</c>.
/// </param>
/// <returns>
/// Map from row index to that row's feature vector. Rows with no cell in
/// <c>SheetDict</c> get no entry.
/// </returns>
public Dictionary<int, List<bool>> GenerateSingularFeatureCRF(MSheet mSheet)
{
    var featuresByRow = new Dictionary<int, List<bool>>();

    for (int row = mSheet.StartRow; row < mSheet.StartRow + mSheet.RowNum; ++row)
    {
        // Collect the cells that actually exist on this row.
        var cellsInRow = new Dictionary<int, MCell>();
        for (int col = mSheet.StartCol; col < mSheet.StartCol + mSheet.ColNum; ++col)
        {
            Tuple<int, int> position = Tuple.Create(row, col);
            if (!mSheet.SheetDict.ContainsKey(position))
            {
                continue;
            }

            cellsInRow.Add(col, mSheet.SheetDict[position]);
        }

        if (cellsInRow.Count == 0)
        {
            continue;
        }

        // The flag is set when the preceding row index produced no features:
        // either that row had no cells or this is the first processed row.
        bool previousRowAbsent = !featuresByRow.ContainsKey(row - 1);

        featuresByRow.Add(
            row,
            this.generateFeatureByRowCRF(row, cellsInRow, mSheet, previousRowAbsent));
    }

    return featuresByRow;
}
/// <summary>
/// Creates a cell from the given attributes, stores it in <c>SheetDict</c>
/// under (<paramref name="rowIdx"/>, <paramref name="colIdx"/>) and updates
/// the sheet's accumulated text and maximum row/column indices.
/// </summary>
/// <param name="rowIdx">Row index of the cell.</param>
/// <param name="colIdx">Column index of the cell.</param>
/// <param name="cType">Cell type; the value <c>"str"</c> makes the cell's text get appended to <c>txt</c>.</param>
/// <param name="indents">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="alignStyle">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="borderStyle">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="bgColor">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="boldFlag">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="height">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="italicFlag">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="underlineFlag">Forwarded to <c>MCell.Init</c>.</param>
/// <param name="value">Cell content.</param>
public void InsertCell(int rowIdx, int colIdx, string cType, int indents, int alignStyle, string borderStyle, int bgColor, int boldFlag, int height, int italicFlag, int underlineFlag, string value)
{
    var cell = new MCell();
    // Note: Init takes its arguments in a different order than this method does.
    cell.Init(value, cType, indents, alignStyle, boldFlag, borderStyle, bgColor, height, italicFlag, underlineFlag);

    Tuple<int, int> position = Tuple.Create(rowIdx, colIdx);
    this.SheetDict.Add(position, cell);

    bool isText = cType.Equals("str");
    if (isText)
    {
        // Text cells are accumulated into one space-separated string.
        this.txt = this.txt + value + " ";
    }

    this.maxColNum = Math.Max(this.maxColNum, colIdx);
    this.maxRowNum = Math.Max(this.maxRowNum, rowIdx);
}
/// <summary>
/// Row feature: the row consists of exactly one cell and that cell's value
/// is longer than 40 characters.
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>true</c> for a single-cell row whose value length exceeds 40;
/// <c>false</c> for any other cell count or a shorter value.
/// </returns>
private bool featureWordLengthHigh(Dictionary<int, MCell> rowCellDict)
{
    bool singleCell = rowCellDict.Count == 1;

    // With exactly one entry, First() is that only cell.
    return singleCell && rowCellDict.Values.First().Value.Length > 40;
}
/// <summary>
/// Row feature: a large share of the row's cells are of type <c>"str"</c>
/// and contain at least one ASCII letter.
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>true</c> when such cells make up at least 60% of all cells in the row;
/// <c>false</c> otherwise.
/// </returns>
private bool featureAlphabetaCellnumPercentHigh(Dictionary<int, MCell> rowCellDict)
{
    int lettered = rowCellDict.Values.Count(
        cell => cell.Type.Equals("str") && Regex.IsMatch(cell.Value, @"[A-Za-z]"));

    // For an empty row this is 0.0 / 0.0 = NaN, and NaN >= 0.6 is false.
    double share = (double)lettered / (double)rowCellDict.Count;
    return share >= 0.6;
}
/// <summary>
/// Row feature: every non-empty <c>"str"</c> cell for which <c>hasLetter</c>
/// is true starts with an ASCII capital letter (<c>'A'</c>..<c>'Z'</c>).
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>false</c> as soon as one qualifying cell starts with anything else;
/// <c>true</c> otherwise, including when no cell qualifies.
/// </returns>
private bool featureAlphabetaStartWithCapital(Dictionary<int, MCell> rowCellDict)
{
    foreach (MCell cell in rowCellDict.Values)
    {
        // Only non-empty text cells take part in the check.
        bool candidate = cell.Type.Equals("str") && cell.Value.Length != 0;
        if (!candidate)
        {
            continue;
        }

        if (!this.hasLetter(cell.Value))
        {
            continue;
        }

        char first = cell.Value[0];
        if (first < 'A' || first > 'Z')
        {
            return false;
        }
    }

    return true;
}
/// <summary>
/// Assembles the boolean feature vector for one row. The position of each
/// entry in the returned list is fixed by the order of the calls below.
/// </summary>
/// <param name="crow">Index of the row being described.</param>
/// <param name="rowCellDict">Cells of that row, keyed by column index.</param>
/// <param name="mSheet">Sheet the row belongs to.</param>
/// <param name="blankFlag">Precomputed flag that becomes the first feature.</param>
/// <returns>The feature vector: nine layout features followed by the textual features.</returns>
private List<bool> generateFeatureByRowCRF(int crow, Dictionary<int, MCell> rowCellDict, MSheet mSheet, bool blankFlag)
{
    // Row text: every cell value followed by a single space, in dictionary order.
    string rowText = string.Concat(rowCellDict.Values.Select(cell => cell.Value + " "));

    return new List<bool>
    {
        // layout feature
        blankFlag,
        this.featureHasMergedCell(crow, mSheet),
        this.featureReachRightBound(rowCellDict, mSheet.MaxColNum),
        this.featureReachLeftBound(rowCellDict),
        this.featureIsOneColumn(rowCellDict),
        this.featureHasCenterAlignCell(rowCellDict),
        this.featureHasLeftAlignCell(rowCellDict),
        this.featureHasBoldFontCell(rowCellDict),
        this.featureIndentation(rowText),

        // textual feature
        this.featureStartWithTable(rowText),
        this.featureStartWithPunctation(rowText),
        this.featureNumberPercentHigh(rowCellDict),
        this.featureDigitalPercentHigh(rowCellDict),
        this.featureAlphabetaAllCapital(rowText),
        this.featureAlphabetaStartWithCapital(rowCellDict),
        this.featureAlphabetaStartWithLowercase(rowCellDict),
        this.featureAlphabetaCellnumPercentHigh(rowCellDict),
        this.featureAlphabetaPercentHigh(rowText),
        this.featureContainSpecialChar(rowText),
        this.featureContainColon(rowText),
        // feavec.Add(this.featureYearRangeCellnumHigh(rowCellDict));
        this.featureYearRangePercentHigh(rowCellDict),
        this.featureWordLengthHigh(rowCellDict),
    };
}
/// <summary>
/// Row feature: the row's cell values contain at least three numbers in the
/// range 1800..2300 (inclusive), as extracted by <c>getNumberSet</c>.
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>true</c> when three or more in-range numbers are found across the row;
/// <c>false</c> otherwise, including for an empty row.
/// </returns>
private bool featureYearRangeCellnumHigh(Dictionary<int, MCell> rowCellDict)
{
    if (rowCellDict.Count == 0)
    {
        return false;
    }

    // Flatten the numbers of all cells, then count those in the year range.
    int inRange = rowCellDict.Values
        .SelectMany(cell => this.getNumberSet(cell.Value))
        .Count(n => n >= 1800 && n <= 2300);

    return inRange >= 3;
}
/// <summary>
/// Row feature: the cell in the lowest-numbered column starts with an ASCII
/// lowercase letter (<c>'a'</c>..<c>'z'</c>) and <c>hasLetter</c> is true for
/// its value.
/// </summary>
/// <param name="rowCellDict">Cells of a single row, keyed by column index.</param>
/// <returns>
/// <c>true</c> when the leftmost cell's value begins with a lowercase letter;
/// <c>false</c> otherwise, including for an empty row or a null/empty value.
/// </returns>
private bool featureAlphabetaStartWithLowercase(Dictionary<int, MCell> rowCellDict)
{
    // Without this guard the minimum-key search below has nothing to find and
    // the indexer would throw KeyNotFoundException; the other row features
    // answer false for an empty row, so do the same here.
    if (rowCellDict.Count == 0)
    {
        return false;
    }

    int leftmostCol = rowCellDict.Keys.Min();
    string text = rowCellDict[leftmostCol].Value;
    if (string.IsNullOrEmpty(text))
    {
        return false;
    }

    char first = text[0];
    return this.hasLetter(text) && first >= 'a' && first <= 'z';
}