/// <summary>
/// Load the hierarchy of interest types from <c>hierarchyFile</c>.
/// Each line is TAB-separated; the first token is the top-level type and
/// every token on the line (including the first itself) is mapped to it
/// in <c>low2top</c>.
/// </summary>
private void LoadHierarchy()
{
    FileReader reader = new LargeFileReader(hierarchyFile);
    this.low2top = new Dictionary<string, string>();
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        // FIX: removed leftover debug probes (count==120 -> "debug!" and the
        // "medicine.drug" Console.Write) that were shipped in this loop.
        var array = line.Split(new char[] { '\t' }, StringSplitOptions.RemoveEmptyEntries);
        if (array.Length == 0)
        {
            continue;   // skip blank lines
        }
        // Top-level type maps to itself; all descendants map up to it.
        low2top[array[0]] = array[0];
        for (int i = 1; i < array.Length; i++)
        {
            low2top[array[i]] = array[0];
        }
    }
    reader.Close();
}
/// <summary>
/// Load word vectors from <c>vectorFile</c> when the vector count and
/// dimension are known in advance.
/// Each line: [word] SEP [v1] SEP ... [vDimension]; lines with a wrong
/// column count are skipped.
/// </summary>
/// <param name="size">Number of vectors to load (capacity of this.vectors).</param>
/// <param name="dimension">Dimension of each vector.</param>
private void LoadVectors(int size, int dimension)
{
    FileReader reader = new LargeFileReader(vectorFile);
    this.vectors = new double[size][];
    this.words = new List<string>();
    string line;
    int index = 0;
    // FIX: stop once 'size' vectors are loaded; the original indexed past
    // the end of this.vectors when the file held more valid lines than size.
    while (index < size && (line = reader.ReadLine()) != null)
    {
        line = line.Trim();
        var array = line.Split(this.seperator);
        if (array.Length != dimension + 1)
        {
            continue;   // malformed line: skip rather than fail
        }
        words.Add(array[0]);
        var vector = new double[dimension];
        for (int i = 1; i < array.Length; i++)
        {
            vector[i - 1] = double.Parse(array[i]);
        }
        this.vectors[index] = vector;
        index++;
    }
    reader.Close();
}
/// <summary>
/// Extract features for bayes model.
/// Splits the source lines into chunks of 10000, extracts each chunk on its
/// own thread into a temporary file, then concatenates the temporary files
/// into the destination in chunk order and deletes them.
/// </summary>
/// <param name="source">
/// File path storing the data from which this program extract features.
/// </param>
/// <param name="des">
/// File path to store the extracted features.
/// </param>
private static void ExtractBayesFeature(string source, string des)
{
    FileReader reader = new LargeFileReader(source);
    FileWriter writer = new LargeFileWriter(des, FileMode.Create);
    var lines = reader.ReadAllLines().ToList();
    reader.Close();   // FIX: reader was never closed
    const int numPerThread = 10000;
    var threadNum = (int)Math.Ceiling(1.0 * lines.Count / numPerThread);
    var childThreads = new Thread[threadNum];
    var tmpFiles = new string[threadNum];
    for (var i = 0; i < threadNum; i++)
    {
        tmpFiles[i] = "./tmp" + i + ".txt";
        var threadClass = new BayesFeatureThread(lines.GetRange(numPerThread * i, Math.Min(numPerThread, lines.Count - numPerThread * i)), tmpFiles[i]);
        childThreads[i] = new Thread(threadClass.ThreadMain);
        childThreads[i].Name = "thread " + i;
        childThreads[i].Start();
    }
    for (var i = 0; i < threadNum; i++)
    {
        childThreads[i].Join();
    }
    // Concatenate per-thread outputs in order, cleaning up as we go.
    foreach (var tmpFile in tmpFiles)
    {
        var text = File.ReadAllText(tmpFile);
        writer.Write(text);
        File.Delete(tmpFile);
    }
    writer.Close();   // FIX: writer was never closed, risking lost buffered output
}
/// <summary>
/// Sample the first 100 lines of <paramref name="sourceFile"/> and return
/// the set of field names (first capture of <c>fieldRegex</c>) that occur
/// on more than 95% of the sampled lines.
/// </summary>
internal static HashSet<string> GetFields(string sourceFile)
{
    FileReader reader = new LargeFileReader(sourceFile);
    string line;
    var count = 0;
    var dic = new Dictionary<string, int>();
    // FIX: the original incremented count to 101 before breaking, inflating
    // the ratio denominator by one; count now equals lines actually scanned.
    while (count < 100 && (line = reader.ReadLine()) != null)
    {
        count++;
        var mc = fieldRegex.Matches(line);
        foreach (Match match in mc)
        {
            // FIX: use TryGetValue instead of catching the missing-key
            // exception to seed the counter.
            int n;
            dic.TryGetValue(match.Groups[1].Value, out n);
            dic[match.Groups[1].Value] = n + 1;
        }
    }
    reader.Close();
    var fields = new HashSet<string>();
    if (count == 0)
    {
        return fields;   // empty file: nothing sampled
    }
    foreach (var key in dic.Keys.Where(key => (1.0 * dic[key] / count) > 0.95))
    {
        fields.Add(key);
    }
    return fields;
}
/// <summary>
/// Collect the first TAB column from (at most) the first 100 lines of every
/// file under the tmp input directory and write the de-duplicated keywords
/// to keywords.txt.
/// </summary>
static void Temp()
{
    var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\tmp\";
    var des = @"D:\Codes\Project\EntityTyping\Fine-ner\input\keywords.txt";
    var reader = new LargeFileReader();
    var writer = new LargeFileWriter(des, FileMode.Create);
    var keyWords = new HashSet<string>();
    foreach (var file in Directory.GetFiles(sourceDir))
    {
        reader.Open(file);
        // Sample only the first 100 lines of each file.
        for (var sampled = 0; sampled < 100; sampled++)
        {
            var current = reader.ReadLine();
            if (current == null)
            {
                break;
            }
            keyWords.Add(current.Split('\t')[0]);
        }
    }
    reader.Close();
    foreach (var word in keyWords)
    {
        writer.WriteLine(word);
    }
    writer.Close();
}
/// <summary>
/// Load train data from file.
/// </summary>
/// <param name="sourceFile">
/// File path of the train data.
/// </param>
/// <format>
/// [Label] TAB [FieldName]:{[value1],[value2]...} TAB [FieldName]:{[value1],[value2]...} ...
/// </format>
/// <returns>
/// List of pairs: [class label] --> [field name --> list of values].
/// </returns>
internal static List<Pair<string, Dictionary<string, List<string>>>> LoadBayesData(string sourceFile)
{
    FileReader reader = new LargeFileReader(sourceFile);
    var pairs = new List<Pair<string, Dictionary<string, List<string>>>>();
    var labelRegex = new Regex("^[^\t]*");
    var fieldRegex = new Regex(@"\t([^:]*):{([^}]*)}");
    var fields = GetFields(sourceFile);
    var count = 0;
    string line;
    while ((line = reader.ReadLine()) != null)
    {
        count++;
        var dic = new Dictionary<string, List<string>>(fields.Count);
        foreach (Match match in fieldRegex.Matches(line))
        {
            var fieldName = match.Groups[1].Value;
            if (fields.Contains(fieldName))
            {
                dic[fieldName] = match.Groups[2].Value
                    .Split(new char[] { ',' }, StringSplitOptions.RemoveEmptyEntries)
                    .ToList();
            }
            else
            {
                // Field not seen in the sampled header set: report and skip.
                Console.WriteLine("Invalid format in line{0}", count);
            }
        }
        var pair = new Pair<string, Dictionary<string, List<string>>>();
        pair.first = labelRegex.Match(line).Value;
        pair.second = dic;
        pairs.Add(pair);
    }
    reader.Close();
    return pairs;
}
/// <summary>
/// Lazily load the word table (thread-safe, load-once): maps the first TAB
/// column of each line to a dense 0-based index in file order.
/// </summary>
private static void LoadWordTable()
{
    lock (wordTableLocker)
    {
        if (word2index == null)
        {
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file));
            string line;
            var dic = new Dictionary<string, int>();
            while ((line = reader.ReadLine()) != null)
            {
                var array = line.Split('\t');
                // FIX: the original's try/catch was dead (indexer assignment
                // does not throw on duplicates) and duplicates were silently
                // reassigned a fresh index, colliding with other words.
                // Only unseen words get the next free index.
                if (!dic.ContainsKey(array[0]))
                {
                    dic[array[0]] = dic.Count;
                }
            }
            reader.Close();
            word2index = dic;
        }
    }
}
/// <summary>
/// Mention words are seperated by "_".
/// Lazily load the mention -> cluster id table (thread-safe, load-once).
/// Each line: [mention_with_underscores] TAB [cluster id]; underscores are
/// normalized to single spaces. Also records the number of distinct
/// cluster ids in mentionClusterSize.
/// </summary>
private static void LoadMentionClusterID()
{
    lock (mentionIDLocker)
    {
        if (mentionIdDic == null)
        {
            var dic = new Dictionary<string, int>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.mention_id_file));
            string line;
            HashSet<int> ids = new HashSet<int>();
            System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(@"_+");
            while ((line = reader.ReadLine()) != null)
            {
                var array = line.Split('\t');
                // FIX: validate instead of catching IndexOutOfRange/Format
                // exceptions — malformed lines are still skipped silently.
                int id;
                if (array.Length < 2 || !int.TryParse(array[1], out id))
                {
                    continue;
                }
                ids.Add(id);
                dic[regex.Replace(array[0], " ")] = id;
            }
            reader.Close();
            mentionClusterSize = ids.Count;
            mentionIdDic = dic;
        }
    }
}
/// <summary>
/// Lazily load the word -> stem map (thread-safe, load-once).
/// Each line: [word] TAB [stem]; malformed lines are skipped.
/// </summary>
private static void LoadStemMap()
{
    lock (stemmerLocker)
    {
        if (stemWordDic == null)
        {
            var dic = new Dictionary<string, string>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.stem_map));
            string line;
            string[] array;
            while ((line = reader.ReadLine()) != null)
            {
                array = line.Split('\t');
                // FIX: check the column count instead of catching the
                // IndexOutOfRangeException the original relied on.
                if (array.Length >= 2)
                {
                    dic[array[0]] = array[1];
                }
            }
            reader.Close();
            stemWordDic = dic;
        }
    }
}
/// <summary>
/// Lazily load the word -> cluster id table (thread-safe, load-once).
/// Each line: [word] TAB [cluster id]. Also records the number of distinct
/// cluster ids in wordClusterSize.
/// </summary>
private static void LoadWordClusterID()
{
    lock (wordIDLocker)
    {
        if (wordIdDic == null)
        {
            var dic = new Dictionary<string, int>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.word_id_file));
            string line;
            HashSet<int> ids = new HashSet<int>();
            while ((line = reader.ReadLine()) != null)
            {
                var array = line.Split('\t');
                // FIX: validate instead of exception-driven skipping of
                // malformed lines.
                int id;
                if (array.Length < 2 || !int.TryParse(array[1], out id))
                {
                    continue;
                }
                ids.Add(id);
                dic[array[0]] = id;
            }
            reader.Close();
            wordClusterSize = ids.Count;
            wordIdDic = dic;
        }
    }
}
/// <summary>
/// Read the type dictionary from file (lazy, thread-safe, load-once).
/// Each line: [dictionary name] TAB [type1] TAB [type2] ...
/// Populates dics (name -> type list) and dicTypeMap (type -> dense index).
/// </summary>
private static void LoadDictionary()
{
    lock (dicLocker)
    {
        if (dicTypeMap != null)
        {
            return;   // already loaded by an earlier caller
        }
        FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dic_file));
        dics = new Dictionary<string, List<string>>();
        var typeSet = new HashSet<String>();
        String line;
        while ((line = reader.ReadLine()) != null)
        {
            var columns = line.Split('\t').ToList();
            var types = columns.GetRange(1, columns.Count - 1);
            dics[columns[0]] = types;
            foreach (var t in types)
            {
                typeSet.Add(t);
            }
        }
        reader.Close();
        // Assign each distinct type a dense index.
        var typeIndex = new Dictionary<String, int>();
        foreach (var type in typeSet)
        {
            typeIndex[type] = typeIndex.Count;
        }
        dicTypeMap = typeIndex;
    }
}
/// <summary>
/// Detect the separator (TAB or space) of the word-vector file from its
/// first line, store it in this.seperator, and return the vector dimension
/// (column count minus the leading word).
/// </summary>
/// <exception cref="Exception">
/// Thrown when the file is empty or neither separator yields a parseable
/// numeric second column.
/// </exception>
private int GetVectorDimension()
{
    FileReader reader = new LargeFileReader(vectorFile);
    string line = reader.ReadLine();
    reader.Close();   // FIX: the reader was never closed (handle leak)
    if (line == null)
    {
        // FIX: the original threw NullReferenceException on an empty file.
        throw new Exception("Cannot parse word vector file: file is empty!");
    }
    line = line.Trim();
    char[] seperators = new char[] { '\t', ' ' };
    string[] array;
    double d;
    foreach (var c in seperators)
    {
        array = line.Split(c);
        if (array.Length > 1 && double.TryParse(array[1], out d))
        {
            seperator = c;
            break;
        }
    }
    if (seperator == (char)0)
    {
        throw new Exception("Cannot parse word vector file with default seperators:TAB and Space!\r" + "Please check your file format!");
    }
    array = line.Split(seperator);
    return (array.Length - 1);
}
/// <summary>
/// Lazily load the POS-tag table (thread-safe, load-once): each line of the
/// file is a tag, mapped to a dense 0-based index in file order.
/// </summary>
private static void LoadPosTagTable()
{
    lock (posTagLocker)
    {
        if (posTag2index == null)
        {
            var dic = new Dictionary<string, int>();
            FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.posTag_table_file));
            String line;
            while ((line = reader.ReadLine()) != null)
            {
                // FIX: the original's try/catch was dead code (indexer
                // assignment never throws here) and duplicate lines were
                // reassigned a new index, colliding with other tags.
                if (!dic.ContainsKey(line))
                {
                    dic[line] = dic.Count;
                }
            }
            reader.Close();
            posTag2index = dic;
        }
    }
}
/// <summary>
/// Lazily load the DBpedia redirect table (thread-safe, load-once).
/// Builds three maps from the TAB-separated redirect file:
///   redirects:                       "source name" -> "target name" (spaces kept)
///   redirectsWithoutSpace:           "sourcename"  -> "targetname"  (spaces removed)
///   redirectsWithoutSpace2WithSpace: "sourcename"  -> "target name"
/// All entries are lower-cased; a "(word)" brace suffix is stripped and runs
/// of '_' are normalized to single spaces.
/// </summary>
public static void LoadDBpediaRedirect()
{
    lock (dbpediaRedirectLocker)
    {
        if (redirects == null)
        {
            var dic = new Dictionary<string, string>();
            var dic2 = new Dictionary<string, string>();
            var dic3 = new Dictionary<string, string>();
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dbpedia_redirect_file));
            var line = "";
            System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(@"_+");
            System.Text.RegularExpressions.Regex deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
            while ((line = reader.ReadLine()) != null)
            {
                line = line.ToLower();
                // Expected format: [source title] TAB [redirect target].
                // NOTE(review): array[1] throws on lines without a TAB —
                // assumes well-formed input; confirm against the dump format.
                var array = line.Split('\t');
                var source = deleteBrace.Replace(array[0], "");
                source = regex.Replace(source, " ").Trim();
                var des = deleteBrace.Replace(array[1], "");
                des = regex.Replace(des, " ").Trim();
                dic[source] = des;
                // deleteSpace is a class-level regex defined elsewhere —
                // presumably it removes whitespace; verify its pattern.
                var source2 = deleteSpace.Replace(source, "");
                var des2 = deleteSpace.Replace(des, "");
                dic2[source2] = des2;
                dic3[source2] = des;
            }
            reader.Close();
            redirects = dic;
            redirectsWithoutSpace = dic2;
            redirectsWithoutSpace2WithSpace = dic3;
        }
    }
}
/// <summary>
/// Count items per type (second TAB column) in <paramref name="sourceFile"/>
/// and return a report: one "\t[type]\t[count]\r" line per type plus a
/// final "\ttotal\t[n]" line.
/// </summary>
public static string StatisticItemNumberByType(String sourceFile)
{
    FileReader reader = new LargeFileReader(sourceFile);
    Dictionary<string, int> NumByType = new Dictionary<string, int>();
    string line;
    String[] array;
    int total = 0;
    while ((line = reader.ReadLine()) != null)
    {
        total++;
        array = line.Split('\t');
        // FIX: TryGetValue instead of catching the missing-key exception —
        // exceptions as control flow and far slower on first occurrences.
        int num;
        NumByType.TryGetValue(array[1], out num);
        NumByType[array[1]] = num + 1;
    }
    reader.Close();
    StringBuilder buffer = new StringBuilder();
    foreach (String type in NumByType.Keys)
    {
        buffer.Append("\t" + type + "\t" + NumByType[type] + "\r");
    }
    buffer.Append("\ttotal\t" + total);
    return (buffer.ToString());
}
/************************************************************************/
/* Statistic the co-occurrence rate between test data and train data    */
/************************************************************************/
/// <summary>
/// Report, per type, how many test mentions (first TAB column) also occur
/// in the train data, plus the overall coverage ratio.
/// Line format of both files: [mention] TAB [type] ...
/// </summary>
public static string StatisticCooccurrence(String trainFilePath, String testFilePath)
{
    FileReader reader = new LargeFileReader(trainFilePath);
    String line;
    HashSet<String> set = new HashSet<string>();
    Dictionary<string, int> hitNumByType = new Dictionary<string, int>();
    Dictionary<string, int> numByType = new Dictionary<string, int>();
    string[] array;
    // store mentions of train data into a set
    while ((line = reader.ReadLine()) != null)
    {
        array = line.Split('\t');
        set.Add(array[0]);
    }
    reader.Close();
    reader.Open(testFilePath);
    int total = 0;
    int coNum = 0;
    // scan the test data
    while ((line = reader.ReadLine()) != null)
    {
        total++;
        array = line.Split('\t');
        int num;
        // FIX: reuse array[0] (the original re-split the line) and use
        // TryGetValue instead of exception-driven counter initialization.
        if (set.Contains(array[0]))
        {
            coNum++;
            hitNumByType.TryGetValue(array[1], out num);
            hitNumByType[array[1]] = num + 1;
        }
        numByType.TryGetValue(array[1], out num);
        numByType[array[1]] = num + 1;
    }
    reader.Close();
    StringBuilder buffer = new StringBuilder();
    foreach (String type in numByType.Keys)
    {
        int hit;
        hitNumByType.TryGetValue(type, out hit);
        buffer.Append("\t" + type + "\t" + hit + "\t" + numByType[type] + "\t" + 1.0 * hit / numByType[type] + "\r");
    }
    buffer.Append("\ttotal coverage is: " + (1.0 * coNum / total));
    return (buffer.ToString());
}
/************************************************************************/
/* Statistic the coverage of the dictionary
 * Note: all the entity is represent as lower case format               */
/************************************************************************/
/// <summary>
/// Report, per type, how many source entities (first TAB column,
/// lower-cased) are covered by the dictionary, plus the overall ratio
/// printed to the console.
/// </summary>
public static string StatisticDicCoverage(String dicFile, String sourceFile)
{
    FileReader reader = new LargeFileReader(dicFile);
    String line;
    HashSet<String> set = new HashSet<string>();
    Dictionary<string, int> hitNumByType = new Dictionary<string, int>();
    Dictionary<string, int> NumByType = new Dictionary<string, int>();
    String[] array;
    while ((line = reader.ReadLine()) != null)
    {
        set.Add(line.Split('\t')[0]);
    }
    reader.Close();
    reader.Open(sourceFile);
    int total = 0;
    int coNum = 0;
    while ((line = reader.ReadLine()) != null)
    {
        total++;
        array = line.Split('\t');
        int num;
        // FIX: TryGetValue replaces the exception-driven counter seeding.
        if (set.Contains(array[0].ToLower()))
        {
            coNum++;
            hitNumByType.TryGetValue(array[1], out num);
            hitNumByType[array[1]] = num + 1;
        }
        NumByType.TryGetValue(array[1], out num);
        NumByType[array[1]] = num + 1;
    }
    reader.Close();
    Console.WriteLine("dic coverage rate is: " + 1.0 * coNum / total);
    StringBuilder buffer = new StringBuilder();
    foreach (String type in NumByType.Keys)
    {
        int hit;
        hitNumByType.TryGetValue(type, out hit);
        buffer.Append("\t" + type + "\t" + hit + "\t" + NumByType[type] + "\t" + 1.0 * hit / NumByType[type] + "\r");
    }
    return (buffer.ToString());
}
/// <summary>
/// Load the preposition list, one preposition per line, into the static
/// prepositions set.
/// NOTE(review): the reader is constructed without a path — presumably the
/// default constructor targets a default file; confirm against FileReader.
/// </summary>
private static void LoadPreposition()
{
    prepositions = new HashSet<String>();
    FileReader reader = new LargeFileReader();
    String line;
    while ((line = reader.ReadLine()) != null)
    {
        prepositions.Add(line);
    }
    reader.Close();   // FIX: the reader was never closed (handle leak)
}
/// <summary>
/// Load Bayes Statistic Model.
/// Format: a line starting with a word character is a label, immediately
/// followed by a field-name line; subsequent non-word lines are
/// "[value] TAB [count]" entries for that (label, field).
/// </summary>
/// <param name="modelFile">Path of the serialized model.</param>
/// <returns>label -> field -> value -> count.</returns>
public static Dictionary<string, Dictionary<string, Dictionary<string, int>>> LoadModel(string modelFile)
{
    var model = new Dictionary<string, Dictionary<string, Dictionary<string, int>>>();
    FileReader reader = new LargeFileReader(modelFile);
    string line;
    var count = 0;
    var regex = new Regex(@"^\w");
    Dictionary<string, Dictionary<string, int>> dicByField = null;
    var dicByValue = new Dictionary<string, int>();
    while ((line = reader.ReadLine()) != null)
    {
        count++;
        if (regex.IsMatch(line))
        {
            // get new label or feild
            var label = line;
            // FIX: TryGetValue replaces the exception-driven lookups.
            if (!model.TryGetValue(label, out dicByField))
            {
                dicByField = new Dictionary<string, Dictionary<string, int>>();
                model[label] = dicByField;
            }
            var field = reader.ReadLine();
            if (field == null)
            {
                break;   // FIX: truncated file (label without field) no longer throws NRE
            }
            if (!dicByField.TryGetValue(field, out dicByValue))
            {
                dicByValue = new Dictionary<string, int>();
                dicByField[field] = dicByValue;
            }
        }
        else
        {
            line = line.Trim();
            var array = line.Split('\t');
            if (array.Length != 2)
            {
                Console.WriteLine("Wrong Format in line" + count);
                continue;
            }
            dicByValue[array[0]] = int.Parse(array[1]);
        }
    }
    reader.Close();
    return (model);
}
/// <summary>
/// Read the feature-weight table into w.
/// Each line: [feature] TAB [weight].
/// </summary>
private void LoadWeight(string weightFilePath)
{
    w = new Dictionary<string, double>();
    var reader = new LargeFileReader(weightFilePath);
    for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
    {
        var columns = row.Split('\t');
        w[columns[0]] = double.Parse(columns[1]);
    }
    reader.Close();
}
/* Train file format:
 * Mention  Type  Context
 * Extract word table and word shape table from train data
 * Every word is converted to lowercase and stemmed
 */
/// <summary>
/// Tokenize the context column (3rd TAB field) of every train line and
/// write each previously-unseen word shape and generalized word to the
/// shape-table and word-table files respectively.
/// </summary>
public void ExtractWordTable()
{
    FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file));
    FileWriter writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file), FileMode.Create);
    FileWriter wordShapeWriter = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_shape_table_file), FileMode.Create);
    string line = null;
    var wordTable = new HashSet<string>();
    var wordShapeTable = new HashSet<string>();
    while ((line = reader.ReadLine()) != null)
    {
        try
        {
            var array = line.Split('\t');
            var tokenizer = TokenizerPool.GetTokenizer();
            try
            {
                var words = tokenizer.Tokenize(array[2]);
                foreach (var w in words)
                {
                    if (!string.IsNullOrEmpty(w)) // w should not be empty
                    {
                        var shape = Feature.GetWordShape(w);
                        if (!wordShapeTable.Contains(shape))
                        {
                            wordShapeWriter.WriteLine(shape);
                            wordShapeTable.Add(shape);
                        }
                        var word = Generalizer.Generalize(w);
                        if (!wordTable.Contains(word))
                        {
                            writer.WriteLine(word);
                            wordTable.Add(word);
                        }
                    }
                }
            }
            finally
            {
                // FIX: return the tokenizer even when tokenization throws,
                // so bad input lines cannot drain the pool.
                TokenizerPool.ReturnTokenizer(tokenizer);
            }
        }
        catch (Exception e)
        {
            Console.WriteLine("=================error!===============");
            Console.WriteLine("\t" + e.Message);
            Console.WriteLine("\t" + e.StackTrace);
            Console.WriteLine("=================error!===============");
            continue;
        }
    }
    reader.Close();
    writer.Close();
    wordShapeWriter.Close();   // FIX: the shape writer was never closed
}
/// <summary>
/// Read the per-type item totals from sourceFileInfoFile.
/// Each line: [type] TAB [count].
/// </summary>
private Dictionary<string, int> LoadTotalNumByType()
{
    var totals = new Dictionary<string, int>();
    var reader = new LargeFileReader(this.sourceFileInfoFile);
    for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
    {
        var columns = row.Split('\t');
        totals[columns[0]] = int.Parse(columns[1]);
    }
    reader.Close();
    return totals;
}
/// <summary>
/// Compare the result of old model and the newest model.
/// Each result line: [item id] TAB [predicted] TAB [expected...]; an item is
/// "right" when the third column starts with the second. Returns a 2x2
/// right/wrong contingency table between the two result files.
/// </summary>
/// <param name="resultFile1">File path storing the old result.</param>
/// <param name="resultFile2">File path storing the new result.</param>
public static string CompareResult(string resultFile1, string resultFile2)
{
    var itemLabels = new HashSet<string>();
    var positiveItemsInResultOne = new HashSet<string>();
    var positiveItemsInResultTwo = new HashSet<string>();
    var negtiveItemsInResultOne = new HashSet<string>();
    var negtiveItemsInResultTwo = new HashSet<string>();
    string line;
    FileReader reader = new LargeFileReader(resultFile1);
    while ((line = reader.ReadLine()) != null)
    {
        var array = line.Split('\t');
        itemLabels.Add(array[0]);
        if (array[2].StartsWith(array[1]))
        {
            positiveItemsInResultOne.Add(array[0]);
        }
        else
        {
            negtiveItemsInResultOne.Add(array[0]);
        }
    }
    reader.Open(resultFile2);
    while ((line = reader.ReadLine()) != null)
    {
        var array = line.Split('\t');
        if (array[2].StartsWith(array[1]))
        {
            positiveItemsInResultTwo.Add(array[0]);
        }
        else
        {
            negtiveItemsInResultTwo.Add(array[0]);
        }
    }
    reader.Close();
    // FIX: report was null (NullReferenceException on the first Append);
    // the string.Format argument lists were outside the call parentheses
    // (FormatException at runtime); and toString() is not a C# method.
    StringBuilder report = new StringBuilder();
    var pp = GetIntersection(positiveItemsInResultOne, positiveItemsInResultTwo).Count;
    var pn = GetIntersection(positiveItemsInResultOne, negtiveItemsInResultTwo).Count;
    var np = GetIntersection(negtiveItemsInResultOne, positiveItemsInResultTwo).Count;
    var nn = GetIntersection(negtiveItemsInResultOne, negtiveItemsInResultTwo).Count;
    report.Append("old|new | right | wrong\r");
    report.Append(string.Format(" right | {0} | {1}\r", pp, pn));
    report.Append(string.Format(" wrong | {0} | {1}\r", np, nn));
    return (report.ToString());
}
/// <summary>
/// Count the model sections in the file by counting occurrences of the
/// BayesModel.END terminator line.
/// </summary>
public static int GetDimension(string modelFilePath)
{
    var dimension = 0;
    FileReader reader = new LargeFileReader(modelFilePath);
    for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
    {
        if (row.Equals(BayesModel.END))
        {
            dimension++;
        }
    }
    reader.Close();
    return dimension;
}
/// <summary>
/// Extract the first TAB column of the UIUC frequency name list into a
/// plain name-list file.
/// </summary>
public static void ExtractUIUC()
{
    string source = @"E:\Users\v-mipeng\Data\Dictionary\name-list.freq.txt";
    FileReader reader = new LargeFileReader(source);
    string des = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\data\name-list.txt";
    FileWriter writer = new LargeFileWriter(des, FileMode.OpenOrCreate);
    String line;
    String[] array;
    while ((line = reader.ReadLine()) != null)
    {
        array = line.Split('\t');
        writer.WriteLine(array[0]);
    }
    // FIX: both handles were leaked; unflushed writer output could be lost.
    reader.Close();
    writer.Close();
}
/// <summary>
/// Combine files given by sourceFiles into one file given by desFile.
/// </summary>
/// <param name="sourceFiles">
/// Source file pathes to be combined.
/// </param>
/// <param name="desFile">
/// The file path to store the combined file.
/// </param>
public static void CombineFiles(IEnumerable<string> sourceFiles, string desFile)
{
    var writer = new LargeFileWriter(desFile, FileMode.Create);
    var reader = new LargeFileReader();
    foreach (var sourceFile in sourceFiles)
    {
        reader.Open(sourceFile);
        for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
        {
            writer.WriteLine(row);
        }
    }
    reader.Close();
    writer.Close();
}
/// <summary>
/// Worker body for keyword selection: for each processed line, POS-tags the
/// 4th TAB field, keeps nouns/verbs/adjectives (tags starting N/V/J),
/// stems + lower-cases them, and counts per distinct token the number of
/// lines it occurs in. Publishes (lines processed, token -> line frequency)
/// into the shared KeyWordSelector.tuples slot for this thread.
/// </summary>
public void GetKeyWordInfo()
{
    Console.WriteLine("Thread {0} start.", threadID);
    var reader = new LargeFileReader(source);
    var wordOccurNumDic = new Dictionary<string, int>();
    var line = "";
    var classNum = 0;
    var tagger = PosTaggerPool.GetPosTagger();
    var set = new HashSet<string>();
    while ((line = reader.ReadLine()) != null)
    {
        // Cap the work per thread. NOTE(review): the check precedes the
        // increment, so 10001 lines are actually processed — confirm intent.
        if (classNum > 10000)
        {
            break;
        }
        classNum++;
        if (classNum % 1000 == 0)
        {
            Console.WriteLine("Thread {0} has processed: {1}", threadID, classNum);
        }
        var array = line.Split('\t');
        // Tag the 4th TAB field; pairs are (token, posTag) pairs.
        var pairs = tagger.TagString(array[3]);
        set.Clear();   // 'set' de-duplicates tokens within a single line
        foreach (var pair in pairs)
        {
            // Keep content words only: nouns (N*), verbs (V*), adjectives (J*).
            if (pair.second.StartsWith("N") || pair.second.StartsWith("V") || pair.second.StartsWith("J"))
            {
                var tokenStemmed = Generalizer.Generalize(pair.first).ToLower();
                set.Add(tokenStemmed);
            }
        }
        // Increment the line (document) frequency once per distinct token.
        foreach (var token in set)
        {
            int num = 0;
            wordOccurNumDic.TryGetValue(token, out num);
            wordOccurNumDic[token] = num + 1;
        }
    }
    reader.Close();
    PosTaggerPool.ReturnPosTagger(tagger);
    // Publish this thread's result into the shared slot keyed by threadID.
    KeyWordSelector.tuples[threadID] = new Tuple(classNum, wordOccurNumDic);
}
// read word vectors from file
/// <summary>
/// Load all word vectors. Detects the dimension (and separator) from the
/// file when unknown; when the vector count is unknown, scans the whole
/// file and sizes the arrays from what was read, otherwise delegates to
/// LoadVectors(size, dimension).
/// </summary>
private void LoadVectors()
{
    if (this.dimension == 0)
    {
        this.dimension = GetVectorDimension();   // also detects this.seperator
    }
    if (this.size == 0)
    {
        FileReader reader = new LargeFileReader(vectorFile);
        this.words = new List<string>();
        var vectorList = new List<double[]>();
        string line;
        // FIX: the original hard-coded ' ' here while GetVectorDimension
        // detected this.seperator — a TAB-separated file computed its
        // dimension correctly yet loaded zero vectors. Fall back to ' '
        // only if no separator was ever detected.
        var sep = this.seperator == (char)0 ? ' ' : this.seperator;
        while ((line = reader.ReadLine()) != null)
        {
            line = line.Trim();
            var array = line.Split(sep);
            if (array.Length != dimension + 1)
            {
                continue;   // malformed line: skip
            }
            words.Add(array[0]);
            var vector = new double[dimension];
            for (int i = 1; i < array.Length; i++)
            {
                vector[i - 1] = double.Parse(array[i]);
            }
            vectorList.Add(vector);
        }
        reader.Close();
        this.size = vectorList.Count;
        this.vectors = vectorList.ToArray();
    }
    else
    {
        LoadVectors(this.size, this.dimension);
    }
}
/// <summary>
/// Refine disambiguations file download from dbpedia.
/// Groups every disambiguation-target title under its source title and
/// writes one line per source: [source] TAB [target1] TAB [target2] ...
/// </summary>
/// <param name="sourceFile">DBpedia disambiguations dump.</param>
/// <param name="desFile">Output path; overwritten.</param>
public static void RefineAmbiguousItem(string sourceFile, string desFile)
{
    var reader = new LargeFileReader(sourceFile);
    var writer = new LargeFileWriter(desFile, System.IO.FileMode.Create);
    var line = "";
    // Captures the last path segment before '>' of the first URI on a line.
    System.Text.RegularExpressions.Regex sourceRegex = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>");
    // Strips an "_(...)" / "(...)" disambiguation suffix from the source title.
    System.Text.RegularExpressions.Regex deleteBraceRegex = new System.Text.RegularExpressions.Regex(@"_?\([^\)]+\)");
    // Captures the last path segment of the final URI (lines end with "> .").
    System.Text.RegularExpressions.Regex desRegex = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>\s\.$");
    var dic = new Dictionary<string, List<string>>(300000);   // presized: dump is large
    List<string> list = null;
    // Discard the first line — presumably a dump header; confirm format.
    reader.ReadLine();
    while ((line = reader.ReadLine()) != null)
    {
        var sourceMatch = sourceRegex.Match(line);
        var source = sourceMatch.Groups[1].Value;
        source = deleteBraceRegex.Replace(source, "");
        var desMatch = desRegex.Match(line);
        // Append the target to the source's list, creating it on first sight.
        if (dic.TryGetValue(source, out list))
        {
            list.Add(desMatch.Groups[1].Value);
        }
        else
        {
            list = new List<string>();
            list.Add(desMatch.Groups[1].Value);
            dic[source] = list;
        }
    }
    reader.Close();
    // Emit one TAB-joined line per source title.
    foreach (var item in dic)
    {
        writer.Write(item.Key);
        foreach (var des in item.Value)
        {
            writer.Write("\t" + des);
        }
        writer.WriteLine("");
    }
    writer.Close();
}
/// <summary>
/// Lazily load the DBpedia entity -> type table (thread-safe, load-once).
/// To save memory, a map value is a bare string while an entity has one
/// type and is promoted to a HashSet&lt;string&gt; once a second type
/// appears — hence the Dictionary&lt;string, object&gt;.
/// Entities are lower-cased, "(word)" braces stripped, and underscores
/// removed entirely (keys contain no spaces).
/// </summary>
public static void LoadDBpedia()
{
    lock (dbpediaDicLocker)
    {
        if (dbpediaEntity2Type == null)
        {
            var dic = new Dictionary<string, object>();
            object types = null;
            var reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dbpedia_dic_file));
            var line = "";
            System.Text.RegularExpressions.Regex regex = new System.Text.RegularExpressions.Regex(@"_+");
            System.Text.RegularExpressions.Regex deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
            while ((line = reader.ReadLine()) != null)
            {
                line = line.ToLower();
                // Expected format: [entity] TAB [type].
                var array = line.Split('\t');
                var entity = deleteBrace.Replace(array[0], "");
                entity = regex.Replace(entity, "").Trim(); // does not contains space
                if (dic.TryGetValue(entity, out types))
                {
                    // Entity already present: promote a single string value
                    // to a set on the second type, then keep adding.
                    if (types.GetType().Equals(typeof(string)))
                    {
                        var set = new HashSet<string>();
                        set.Add((string)types);
                        set.Add(array[1]);
                        dic[entity] = set;
                    }
                    else
                    {
                        ((HashSet<string>)types).Add(array[1]);
                    }
                }
                else
                {
                    // First type seen for this entity: store the bare string.
                    dic[entity] = array[1];
                }
            }
            reader.Close();
            dbpediaEntity2Type = dic;
        }
    }
}