/// <summary>
/// Initialize DictMatch Feature Generator: resets the matcher and its result
/// buffers, reads "GenerateFeatureDictMatch.ini" and loads the dictionary it names.
/// </summary>
/// <returns>
/// <c>false</c> when either the dictionary file name key or the binary-type key is
/// absent from the configuration; <c>true</c> otherwise. An empty dictionary file
/// name is accepted and simply means no dictionary is loaded.
/// </returns>
/// <exception cref="System.FormatException">
/// Thrown by <c>bool.Parse</c> when the binary-type setting is not a valid boolean.
/// </exception>
public bool Initialize()
{
    dictmatch = new DictMatch();
    dm_r = new List<Lemma>();
    dm_offsetList = new List<int>();

    Dictionary<string, string> settings = LoadConfigFile("GenerateFeatureDictMatch.ini");

    // Both settings are mandatory; bail out as soon as one of them is missing.
    if (!settings.TryGetValue(KEY_LEXICAL_DICT_FILE_NAME.ToLower(), out var dictFileName)
        || !settings.TryGetValue(KEY_BINARY_DICT_TYPE.ToLower(), out var binaryFlagText))
    {
        return false;
    }

    // Parsed before the empty-name check, so a malformed flag throws even
    // when no dictionary file is configured.
    bool isBinary = bool.Parse(binaryFlagText);

    if (dictFileName.Length != 0)
    {
        if (isBinary)
        {
            dictmatch.LoadDictFromBinary(dictFileName);
        }
        else
        {
            dictmatch.LoadDictFromRawText(dictFileName);
        }
    }

    return true;
}
/// <summary>
/// Reads each line from <paramref name="strTextFileName"/>, searches it with
/// <paramref name="match"/> and prints every matched term to the console.
/// </summary>
/// <param name="strTextFileName">Path of the text file to scan line by line.</param>
/// <param name="match">Dictionary matcher that has already been loaded by the caller.</param>
public static void Match(string strTextFileName, DictMatch match)
{
    List<Lemma> dm_r = new List<Lemma>();
    List<int> offsetList = new List<int>();

    // 'using' guarantees the file handle is released even if Search,
    // Substring or Console.WriteLine throws part-way through the file.
    using (StreamReader sr = new StreamReader(strTextFileName))
    {
        while (sr.EndOfStream == false)
        {
            var strLine = sr.ReadLine();
            if (string.IsNullOrEmpty(strLine))
            {
                // Nothing to match on blank (or missing) lines.
                continue;
            }

            // The result buffers are reused across lines, so reset them first.
            dm_r.Clear();
            offsetList.Clear();
            match.Search(strLine, ref dm_r, ref offsetList, DictMatch.DM_OUT_FMM);

            // If dm_r.Count > 0, some contiguous terms in strLine have matched
            // terms in the dictionary; dm_r[i] and offsetList[i] describe match i.
            for (int i = 0; i < dm_r.Count; i++)
            {
                uint len = dm_r[i].len;
                int offset = offsetList[i];
                string strProp = dm_r[i].strProp;
                string strTerm = strLine.Substring(offset, (int)len);
                Console.WriteLine("Matched term: {0}[offset:{1}, len:{2}, prop:{3}]", strTerm, offset, len, strProp);
            }
        }
    }
}
/// <summary>
/// Loads a dictionary from its raw text form and runs the console
/// <c>Match</c> report over a test file.
/// </summary>
/// <param name="strTestFileName">Path of the text file whose lines are matched.</param>
/// <param name="strRawDictFileName">Path of the raw text dictionary to load.</param>
public static void VerifyRawTextDict(string strTestFileName, string strRawDictFileName)
{
    Console.WriteLine("Load raw text dictionary...");
    var rawTextMatcher = new DictMatch();
    rawTextMatcher.LoadDictFromRawText(strRawDictFileName);

    Console.WriteLine("Verify raw text dictionary...");
    Match(strTestFileName, rawTextMatcher);
}
/// <summary>
/// Loads a raw text dictionary and hands the input and output paths, together
/// with the loaded matcher, to the three-argument <c>Match</c> overload.
/// </summary>
/// <param name="inputFilePath">Path of the text file to read.</param>
/// <param name="outputFilePath">Path of the file that receives the result.</param>
/// <param name="dictFilePath">Path of the raw text dictionary to load.</param>
public static void DictMatchSequences(string inputFilePath, string outputFilePath, string dictFilePath)
{
    Console.WriteLine("Load raw text dictionary...");
    var dictionary = new DictMatch();
    dictionary.LoadDictFromRawText(dictFilePath);

    Console.WriteLine("Verify raw text dictionary...");
    Match(inputFilePath, outputFilePath, dictionary);
}
/// <summary>
/// Reads each line from <paramref name="inputFilePath"/>, searches it with
/// <paramref name="match"/> and writes the line to
/// <paramref name="outputFilePath"/> with every matched term wrapped as
/// <c>&lt;prop&gt; term &lt;/prop&gt;</c>, where <c>prop</c> is the match's
/// <c>strProp</c>. Empty input lines produce no output line.
/// </summary>
/// <param name="inputFilePath">Path of the text file to read line by line.</param>
/// <param name="outputFilePath">Path of the file that receives the tagged lines.</param>
/// <param name="match">Dictionary matcher that has already been loaded by the caller.</param>
public static void Match(string inputFilePath, string outputFilePath, DictMatch match)
{
    List<Lemma> dm_r = new List<Lemma>();
    List<int> offsetList = new List<int>();

    // 'using' guarantees both files are closed (and the writer flushed)
    // even if an exception is thrown while processing a line.
    using (StreamReader sr = new StreamReader(inputFilePath))
    using (StreamWriter sw = new StreamWriter(outputFilePath))
    {
        while (sr.EndOfStream == false)
        {
            string? line = sr.ReadLine();
            if (string.IsNullOrEmpty(line))
            {
                continue;
            }

            // The result buffers are reused across lines, so reset them first.
            dm_r.Clear();
            offsetList.Clear();
            match.Search(line, ref dm_r, ref offsetList, DictMatch.DM_OUT_FMM);

            // If dm_r.Count > 0, some contiguous terms in the line have matched
            // terms in the dictionary; dm_r[i] and offsetList[i] describe match i.
            StringBuilder sb = new StringBuilder();
            int currOffset = 0; // first character of the line not yet copied to sb
            for (int i = 0; i < dm_r.Count; i++)
            {
                uint len = dm_r[i].len;
                int offset = offsetList[i];
                string strProp = dm_r[i].strProp;
                string strTerm = line.Substring(offset, (int)len);

                // Copy the unmatched text that sits between the previous match and this one.
                if (offset > currOffset)
                {
                    sb.Append(line.Substring(currOffset, offset - currOffset));
                }

                sb.Append($" <{strProp}> {strTerm} </{strProp}> ");
                currOffset = (int)(offset + len);
            }

            // Copy whatever follows the last match (or the whole line when nothing matched).
            if (currOffset < line.Length)
            {
                sb.Append(line.Substring(currOffset));
            }

            // The tags are padded with spaces on both sides, so adjacent tags or
            // neighbouring blanks yield double spaces; collapse each pair to one.
            // (Replacing a single space with a single space would do nothing.)
            sw.WriteLine(sb.ToString().Replace("  ", " "));
        }
    }
}
/// <summary>
/// Converts a raw text dictionary to binary form (written next to the source
/// file with a ".bin" suffix), loads that binary file into a fresh matcher and
/// runs the console <c>Match</c> report over a test file.
/// </summary>
/// <param name="strTestFileName">Path of the text file whose lines are matched.</param>
/// <param name="strRawDictFileName">Path of the raw text dictionary to convert.</param>
public static void VerifyBinaryDict(string strTestFileName, string strRawDictFileName)
{
    Console.WriteLine("Convert dictionary from raw text to binary format.");
    string binaryDictFileName = strRawDictFileName + ".bin";
    var converter = new DictMatch();
    converter.ConvertDictFromRawTextToBinary(strRawDictFileName, binaryDictFileName);

    // A separate instance is used for loading, so the converter's state is not reused.
    Console.WriteLine("Load binary dictionary...");
    var binaryMatcher = new DictMatch();
    binaryMatcher.LoadDictFromBinary(binaryDictFileName);

    Console.WriteLine("Verify binary dictionary...");
    Match(strTestFileName, binaryMatcher);
}