コード例 #1
0
ファイル: DataCenter.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Writes the in-memory stem dictionary to disk and then releases it.
        /// </summary>
        /// <param name="des">
        /// Destination file path. When null, the path is read from
        /// <c>DefaultParameter.Field.stem_map</c>.
        /// </param>
        /// <remarks>
        /// Does nothing when no stem dictionary is loaded. Each entry is written as
        /// "key TAB value" on its own line. The dictionary reference is cleared only
        /// after every entry has been written and the file has been closed.
        /// </remarks>
        public static void RefreshStemDic(string des = null)
        {
            lock (stemmerLocker)
            {
                if (stemWordDic == null)
                {
                    return;
                }
                if (des == null)
                {
                    des = (string)DefaultParameter.Get(DefaultParameter.Field.stem_map);
                }

                FileWriter writer = new LargeFileWriter(des, FileMode.Create);
                try
                {
                    foreach (var word in stemWordDic.Keys)
                    {
                        writer.WriteLine(word + "\t" + stemWordDic[word]);
                    }
                }
                finally
                {
                    // Release the file handle even when a write fails part-way through.
                    writer.Close();
                }
                stemWordDic = null;
            }
        }
コード例 #2
0
        /// <summary>
        /// Collects the first tab-separated field of the first 100 lines of every file in a
        /// fixed scratch directory and writes the distinct values, one per line, to a fixed
        /// keywords file.
        /// </summary>
        /// <remarks>
        /// The output file is created only after all input has been read, so a failure while
        /// reading no longer leaves a truncated keywords file behind.
        /// </remarks>
        static void Temp()
        {
            var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\tmp\";
            var des       = @"D:\Codes\Project\EntityTyping\Fine-ner\input\keywords.txt";
            var files     = Directory.GetFiles(sourceDir);
            var keyWords  = new HashSet<string>();

            var reader = new LargeFileReader();
            try
            {
                foreach (var file in files)
                {
                    reader.Open(file);
                    string line;
                    int count = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        // Only the first 100 lines of each file are sampled.
                        if (++count > 100)
                        {
                            break;
                        }
                        keyWords.Add(line.Split('\t')[0]);
                    }
                }
            }
            finally
            {
                reader.Close();
            }

            var writer = new LargeFileWriter(des, FileMode.Create);
            try
            {
                foreach (var word in keyWords)
                {
                    writer.WriteLine(word);
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #3
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Counts how often each value occurs in the second tab-separated column of the
        /// DBpedia entity-type file and writes "value TAB count" lines, most frequent first.
        /// </summary>
        /// <remarks>
        /// Progress (the number of lines read) is printed every 10000 lines. A line without a
        /// second column still raises <see cref="IndexOutOfRangeException"/>, as before.
        /// </remarks>
        public static void Temp4()
        {
            var source     = @"D:\Codes\Project\EntityTyping\Fine-ner\input\dictionaries\dbpedia\dbpedia entity type.txt";
            var des        = @"D:\Codes\Project\EntityTyping\Fine-ner\input\dictionaries\dbpedia\tmp.txt";
            var typeCounts = new Dictionary<string, int>();
            int lineCount  = 0;

            var reader = new pml.file.reader.LargeFileReader(source);
            try
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    if (++lineCount % 10000 == 0)
                    {
                        Console.WriteLine(lineCount);
                    }
                    var array = line.Split('\t');
                    int times;
                    // TryGetValue leaves times at 0 for a type seen for the first time.
                    typeCounts.TryGetValue(array[1], out times);
                    typeCounts[array[1]] = times + 1;
                }
            }
            finally
            {
                reader.Close();
            }

            // The output is opened only once the counts are complete.
            var writer = new LargeFileWriter(des, FileMode.Create);
            try
            {
                foreach (var type in typeCounts.OrderByDescending(pair => pair.Value))
                {
                    writer.WriteLine(type.Key + "\t" + type.Value);
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #4
0
ファイル: Word2Vec.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Converts a binary word2vec file into text: one record per vector, consisting of the
        /// key followed by its space-separated components and terminated by "\r".
        /// </summary>
        /// <param name="word2vecFile">Path of the binary vector file to parse.</param>
        /// <param name="parsedFile">Path of the text file to create.</param>
        /// <remarks>
        /// The key is written without everything up to and including its first "en/". A key
        /// that does not contain "en/" is written unchanged; previously the -1 returned by
        /// IndexOf silently stripped its first two characters. Each record is assembled in
        /// memory and written in one call, so a record that fails part-way leaves no partial
        /// line. Failed records are reported on standard error and skipped.
        /// </remarks>
        static void  ParseWord2Vec(string word2vecFile, string parsedFile)
        {
            var writer = new LargeFileWriter(parsedFile, FileMode.Create);
            try
            {
                var parser = new ParseBinaryVector(word2vecFile);
                int count  = 0;

                while (!parser.EOF)
                {
                    if (++count % 1000 == 0)
                    {
                        Console.WriteLine(count);
                    }
                    try
                    {
                        var pair   = parser.GetNextVector();
                        var index  = pair.first.IndexOf("en/", StringComparison.Ordinal);
                        var key    = index >= 0 ? pair.first.Substring(index + 3) : pair.first;
                        var record = new System.Text.StringBuilder(key);
                        foreach (var value in pair.second)
                        {
                            record.Append(string.Format(" {0}", value));
                        }
                        record.Append("\r");
                        writer.Write(record.ToString());
                    }
                    catch (Exception e)
                    {
                        // Best effort: keep going, but make the skipped record visible.
                        Console.Error.WriteLine("Skipped vector #" + count + ": " + e.Message);
                    }
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #5
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// For every file in the test feature directory, counts the events whose
        /// <c>dbpediaTypes</c> feature is not "UNKNOW" and writes
        /// "file name TAB covered count TAB covered ratio" to a report file.
        /// </summary>
        /// <remarks>
        /// A file without events yields a ratio of NaN (0.0 / 0), exactly as before.
        /// </remarks>
        public static void Temp5()
        {
            var sourceDir   = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\test";
            var sourceFiles = Directory.GetFiles(sourceDir);
            var desFile     = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\test data in dbpedia info.txt";
            var writer      = new LargeFileWriter(desFile, FileMode.Create);

            try
            {
                foreach (var sourceFile in sourceFiles)
                {
                    int count = 0;
                    int total = 0;
                    var reader = new EventReaderByLine(sourceFile);
                    try
                    {
                        while (reader.HasNext())
                        {
                            total++;
                            var event1     = reader.GetNextEvent();
                            var rawFeature = event1.Feature.ToList();
                            if (!rawFeature[(int)Event.Field.dbpediaTypes].Equals("UNKNOW"))
                            {
                                count++;
                            }
                        }
                    }
                    finally
                    {
                        reader.Close();
                    }
                    writer.WriteLine(Path.GetFileNameWithoutExtension(sourceFile) + "\t" + count + "\t" + (1.0 * count / total));
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #6
0
ファイル: TfIdf.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Writes one TF-IDF record per document to <c>vectorPath</c>.
        /// </summary>
        /// <remarks>
        /// Documents are pulled one at a time through <c>ReadOneDoc()</c> until <c>doc</c> is
        /// null. Each record is the 1-based document label followed by TAB-separated
        /// "first:second" pairs from <c>GetTfIdf</c>, terminated by "\r". The number of
        /// processed documents is printed every 1000 documents.
        /// </remarks>
        private void OutputTfIdf()
        {
            var writer = new LargeFileWriter(vectorPath, FileMode.Create);
            try
            {
                int docLabel = 1;
                int num      = 0;

                ReadOneDoc();
                while (this.doc != null)
                {
                    // Borrow a tokenizer only for the duration of the Tokenize call.
                    var tokenizer = TokenizerPool.GetTokenizer();
                    var document  = tokenizer.Tokenize(doc);
                    TokenizerPool.ReturnTokenizer(tokenizer);

                    if (++num % 1000 == 0)
                    {
                        Console.WriteLine(num);
                    }

                    var vector = GetTfIdf(document);
                    writer.Write(docLabel);
                    foreach (var value in vector)
                    {
                        writer.Write("\t" + value.first + ":" + value.second);
                    }
                    writer.Write("\r");

                    ReadOneDoc();
                    docLabel++;
                }
            }
            finally
            {
                // Close the output even when tokenizing or vectorizing a document fails.
                writer.Close();
            }
        }
コード例 #7
0
ファイル: Pipeline.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        ///    Extract features for bayes model
        /// </summary>
        /// <param name="source">
        ///    File path storing the data from which this program extract features.
        /// </param>
        /// <param name="des">
        ///    File path to store the extracted features.
        /// </param>
        /// <remarks>
        ///    The input lines are split into chunks of 10000. Each chunk is processed by its
        ///    own thread, which writes to "./tmp{i}.txt". After all threads have finished, the
        ///    temporary files are appended to <paramref name="des"/> in chunk order and deleted.
        ///    The reader and writer are now always closed; the original closed neither.
        /// </remarks>
        private static void ExtractBayesFeature(string source, string des)
        {
            const int numPerThread = 10000;
            FileReader reader = new LargeFileReader(source);
            FileWriter writer = new LargeFileWriter(des, FileMode.Create);

            try
            {
                var lines        = reader.ReadAllLines().ToList();
                var threadNum    = (int)Math.Ceiling(1.0 * lines.Count / numPerThread);
                var childThreads = new Thread[threadNum];
                var tmpFiles     = new string[threadNum];

                for (var i = 0; i < threadNum; i++)
                {
                    tmpFiles[i] = "./tmp" + i + ".txt";
                    var offset      = numPerThread * i;
                    var chunk       = lines.GetRange(offset, Math.Min(numPerThread, lines.Count - offset));
                    var threadClass = new BayesFeatureThread(chunk, tmpFiles[i]);
                    childThreads[i]      = new Thread(threadClass.ThreadMain);
                    childThreads[i].Name = "thread " + i;
                    childThreads[i].Start();
                }
                foreach (var thread in childThreads)
                {
                    thread.Join();
                }
                // Merge the per-thread results in chunk order, then remove the scratch files.
                foreach (var tmpFile in tmpFiles)
                {
                    writer.Write(File.ReadAllText(tmpFile));
                    File.Delete(tmpFile);
                }
            }
            finally
            {
                try
                {
                    reader.Close();
                }
                finally
                {
                    writer.Close();
                }
            }
        }
コード例 #8
0
        /// <summary>
        /// Saves the word / cluster id pairs to <c>wordClusterIDFile</c>, one
        /// "word TAB label" line per entry of <c>words</c>.
        /// </summary>
        private void SaveWordClusterId()
        {
            var writer = new LargeFileWriter(wordClusterIDFile, FileMode.Create);
            try
            {
                for (int i = 0; i < words.Count; i++)
                {
                    writer.WriteLine(words[i] + "\t" + labels[i]);
                }
            }
            finally
            {
                // Close the file even if labels is shorter than words and indexing throws.
                writer.Close();
            }
        }
コード例 #9
0
ファイル: TfIdf.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Writes every entry of <c>wordTable</c> to <c>wordTablePath</c> as a
        /// "key TAB value" line, replacing any existing file.
        /// </summary>
        private void SaveWordTable()
        {
            var writer = new LargeFileWriter(wordTablePath, FileMode.Create);
            try
            {
                foreach (var word in wordTable.Keys)
                {
                    writer.WriteLine(word + "\t" + wordTable[word]);
                }
            }
            finally
            {
                // Release the file handle even when a write fails.
                writer.Close();
            }
        }
コード例 #10
0
ファイル: TfIdf.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Writes every entry of <c>df</c> to <c>dfPath</c> as a "key TAB value" line,
        /// replacing any existing file.
        /// </summary>
        private void SaveDf()
        {
            var writer = new LargeFileWriter(dfPath, FileMode.Create);
            try
            {
                foreach (var word in df.Keys)
                {
                    writer.WriteLine(word + "\t" + df[word]);
                }
            }
            finally
            {
                // Release the file handle even when a write fails.
                writer.Close();
            }
        }
コード例 #11
0
ファイル: Pipeline.cs プロジェクト: v-mipeng/EntityTyping
            /// <summary>
            /// Thread entry point: extracts the bayes feature of every line in <c>lines</c>
            /// and writes one record per line to <c>des</c>.
            /// </summary>
            /// <remarks>
            /// A record is the feature's <c>first</c> member followed by TAB-separated
            /// "field:{v1,v2,...}" groups and a terminating "\r". Each record is staged in the
            /// writer's cache and flushed only when complete. If extraction fails, the line
            /// and the error message are printed and the staged record is discarded.
            /// Progress is reported every 1000 lines.
            /// </remarks>
            public void ThreadMain()
            {
                FileWriter writer = new LargeFileWriter(this.des, FileMode.Create);
                try
                {
                    var count = 0;

                    foreach (var line in this.lines)
                    {
                        if ((++count) % 1000 == 0)
                        {
                            Console.WriteLine(Thread.CurrentThread.Name + " has processed " + count);
                        }
                        try
                        {
                            var feature = ExtractBayesFeature(line);
                            writer.Cache(feature.first);
                            var dic = feature.second;
                            foreach (var field in dic.Keys)
                            {
                                writer.Cache("\t" + field + ":{");
                                var values = dic[field];
                                // Strings are enumerable too, but must be written as one value.
                                if (values is IEnumerable && !(values is string))
                                {
                                    var begin = true;
                                    foreach (var value in (IEnumerable)values)
                                    {
                                        if (begin)
                                        {
                                            writer.Cache(value);
                                            begin = false;
                                        }
                                        else
                                        {
                                            writer.Cache("," + value);
                                        }
                                    }
                                    writer.Cache("}");
                                }
                                else
                                {
                                    writer.Cache(values + "}");
                                }
                            }
                            writer.Cache("\r");
                            writer.WriteCache();
                        }
                        catch (Exception e)
                        {
                            Console.WriteLine(line);
                            Console.WriteLine(e.Message);
                            // Drop the half-built record so it cannot leak into the next one.
                            writer.ClearCache();
                        }
                    }
                }
                finally
                {
                    // Close the file even when a failure escapes the per-line handler.
                    writer.Close();
                }
            }
コード例 #12
0
        /// <summary>
        /// Opens the analysis report file in append mode and closes it again.
        /// </summary>
        /// <remarks>
        /// Every statistic that used to be written to the report is commented out below, so
        /// in its current state this method writes nothing. The commented-out calls are kept
        /// as a record of the analyses that can be re-enabled.
        /// </remarks>
        public static void Analyse()
        {
            // String basedir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input";
            // NOTE(review): the path literal below starts with a space character - confirm
            // whether that is intentional before relying on the report location.
            String     reportFile = @" E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\report.txt";
            FileWriter writer     = new LargeFileWriter(reportFile, FileMode.Append);
            //// statistic train data number by type and coverage of UIUC by type
            //String dicCovReport = Statistic.StatisticDicCoverage(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\UIUC.txt", basedir + @"\train\train.txt");
            //writer.WriteLine("Coverage of UIUC within train data:\r"+dicCovReport);
            //writer.WriteLine("");
            //basedir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori";
            //// statistic satori develop data number by type and coverage of UIUC by type
            //dicCovReport = Statistic.StatisticDicCoverage(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\UIUC.txt", basedir + @"\develop.txt");
            //writer.WriteLine("Coverage of UIUC within satori develop data:\r" + dicCovReport);
            //writer.WriteLine("");
            //// statistic satori test data number by type and coverage of UIUC by type
            //dicCovReport = Statistic.StatisticDicCoverage(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\UIUC.txt", basedir + @"\test.txt");
            //writer.WriteLine("Coverage of UIUC within satori test data:\r" + dicCovReport);
            //writer.WriteLine("");
            //basedir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc";
            //// statistic satori_lc develop data number by type and coverage of UIUC by type
            //dicCovReport = Statistic.StatisticDicCoverage(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\UIUC.txt", basedir + @"\develop.txt");
            //writer.WriteLine("Coverage of UIUC within satori_lc develop data:\r" + dicCovReport);
            //writer.WriteLine("");
            //// statistic satori_lc test data number by type and coverage of UIUC by type
            //dicCovReport = Statistic.StatisticDicCoverage(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\UIUC.txt", basedir + @"\test.txt");
            //writer.WriteLine("Coverage of UIUC within satori_lc test data:\r" + dicCovReport);
            //writer.WriteLine("");

            // Only referenced by the commented-out statistics below.
            string basedir = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input";

            // // statistic co-occurrence rate between train and satori develop data
            //string corate = Statistic.StatisticCooccurrence(basedir + @"\train\train.txt", basedir + "/satori/develop.txt");
            //writer.WriteLine("Co-occurrence rate between train and satori develop data is:\r "+corate);
            //writer.WriteLine("");
            //// statistic co-occurrence rate between train and satori test data
            //corate = Statistic.StatisticCooccurrence(basedir + @"\train\train.txt", basedir + "/satori/test.txt");
            //writer.WriteLine("Co-occurrence rate between train and satori test data is:\r " + corate);
            //writer.WriteLine("");
            //// statistic co-occurrence rate between train and satori_lc develop data
            //corate = Statistic.StatisticCooccurrence(basedir + @"\train\train.txt", basedir + "/satori_lc/develop.txt");
            //writer.WriteLine("Co-occurrence rate between train and satori_lc develop data is:\r " + corate);
            //writer.WriteLine("");
            //// statistic co-occurrence rate between train and satori_lc test data
            //corate = Statistic.StatisticCooccurrence(basedir + @"\train\train.txt", basedir + "/satori_lc/test.txt");
            //writer.WriteLine("co-occurrence rate between train and satori_lc test data is:\r " + corate);
            //statistic name list coverage
            //String report = Statistic.StatisticNameListCoverageByType(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-list.txt", @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt");
            //writer.WriteLine("Name list coverage by type is :\r " + report);
            // statistic item number by type
            //writer.WriteLine("Item number by type:\r" + Statistic.StatisticItemNumberByType(basedir + @"\train\train.txt"));
            //writer.WriteLine(Statistic.StatisticRoundTokenInformation(basedir + @"\train\train.txt"));
            //writer.WriteLine(Statistic.StatisticWithinTokenInfomation(basedir + @"\train\train.txt"));
            writer.Close();
        }
コード例 #13
0
ファイル: Pipeline.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Extracts the word table and the word shape table from the train data.
        /// </summary>
        /// <remarks>
        /// Train file format (tab separated): Mention, Type, Context. The context column is
        /// tokenized; for each non-empty token its shape (<c>Feature.GetWordShape</c>) and its
        /// generalized form (<c>Generalizer.Generalize</c>; per the original comment this
        /// lowercases and stems the word) are written once each, in first-seen order, to the
        /// word-shape table file and the word table file respectively.
        /// Lines that fail to process are reported on the console and skipped.
        /// All three files are now closed on exit; the word-shape writer was previously
        /// never closed.
        /// </remarks>
        public void ExtractWordTable()
        {
            FileReader reader          = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file));
            FileWriter writer          = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file), FileMode.Create);
            FileWriter wordShapeWriter = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_shape_table_file), FileMode.Create);

            var wordTable      = new HashSet<string>();
            var wordShapeTable = new HashSet<string>();

            try
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    try
                    {
                        // Read the context column before borrowing a tokenizer, so a line
                        // with fewer than three columns cannot strand a pooled tokenizer.
                        var context   = line.Split('\t')[2];
                        var tokenizer = TokenizerPool.GetTokenizer();
                        var words     = tokenizer.Tokenize(context);
                        TokenizerPool.ReturnTokenizer(tokenizer);

                        foreach (var w in words)
                        {
                            if (string.IsNullOrEmpty(w))
                            {
                                continue;
                            }
                            var shape = Feature.GetWordShape(w);
                            if (wordShapeTable.Add(shape))
                            {
                                wordShapeWriter.WriteLine(shape);
                            }
                            var word = Generalizer.Generalize(w);
                            if (wordTable.Add(word))
                            {
                                writer.WriteLine(word);
                            }
                        }
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine("=================error!===============");
                        Console.WriteLine("\t" + e.Message);
                        Console.WriteLine("\t" + e.StackTrace);
                        Console.WriteLine("=================error!===============");
                    }
                }
            }
            finally
            {
                try
                {
                    reader.Close();
                }
                finally
                {
                    try
                    {
                        writer.Close();
                    }
                    finally
                    {
                        wordShapeWriter.Close();
                    }
                }
            }
        }
コード例 #14
0
        /// <summary>
        /// Saves the cluster centroids to <c>centroidInfoFile</c>: one record per centroid,
        /// each component followed by a TAB, each record terminated by "\r".
        /// </summary>
        private void SaveCentroids()
        {
            var writer = new LargeFileWriter(centroidInfoFile, FileMode.Create);
            try
            {
                foreach (var centroid in kmeans.Centroids)
                {
                    foreach (var value in centroid)
                    {
                        writer.Write(string.Format("{0}\t", value));
                    }
                    writer.Write("\r");
                }
            }
            finally
            {
                // Release the file handle even when a write fails.
                writer.Close();
            }
        }
コード例 #15
0
        /// <summary>
        /// Copies the first tab-separated column of the name-list frequency file into the
        /// name-list file, one value per line.
        /// </summary>
        /// <remarks>
        /// The reader and writer are now closed when the method exits; the original closed
        /// neither.
        /// NOTE(review): the output is opened with <c>FileMode.OpenOrCreate</c>, which does
        /// not truncate an existing file - confirm that stale trailing content is acceptable.
        /// </remarks>
        public static void ExtractUIUC()
        {
            string source = @"E:\Users\v-mipeng\Data\Dictionary\name-list.freq.txt";
            string des    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\data\name-list.txt";

            FileReader reader = new LargeFileReader(source);
            try
            {
                FileWriter writer = new LargeFileWriter(des, FileMode.OpenOrCreate);
                try
                {
                    String line;
                    while ((line = reader.ReadLine()) != null)
                    {
                        writer.WriteLine(line.Split('\t')[0]);
                    }
                }
                finally
                {
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
コード例 #16
0
            /// <summary>
            /// Writes the highest-valued keywords to a file as "keyword TAB value" lines,
            /// ordered by descending value.
            /// </summary>
            /// <param name="keyWordDic">Keywords mapped to their values.</param>
            /// <param name="des">Path of the file to create.</param>
            /// <remarks>
            /// At most 1001 entries are written, because the limit is checked after the write.
            /// NOTE(review): a limit of 1000 was presumably intended - confirm before changing.
            /// </remarks>
            private void SaveKeyWords(Dictionary <string, double> keyWordDic, string des)
            {
                var writer = new LargeFileWriter(des, FileMode.Create);
                try
                {
                    int count = 0;
                    foreach (var item in keyWordDic.OrderByDescending(key => key.Value))
                    {
                        count++;
                        writer.WriteLine(item.Key + "\t" + item.Value);
                        if (count > 1000)
                        {
                            break;
                        }
                    }
                }
                finally
                {
                    // Release the file handle even when a write fails.
                    writer.Close();
                }
            }
コード例 #17
0
ファイル: Util.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Combine files given by sourceFiles into one file given by desFile
        /// </summary>
        /// <param name="sourceFiles">
        /// Source file paths to be combined, in the order they should appear in the output
        /// </param>
        /// <param name="desFile">
        /// The file path to store the combined file; an existing file is replaced
        /// </param>
        /// <remarks>
        /// The files are copied line by line with a single reader that is re-opened for each
        /// source file. Reader and writer are closed even when copying fails.
        /// </remarks>
        public static void CombineFiles(IEnumerable <string> sourceFiles, string desFile)
        {
            var reader = new LargeFileReader();
            try
            {
                var writer = new LargeFileWriter(desFile, FileMode.Create);
                try
                {
                    foreach (var file in sourceFiles)
                    {
                        reader.Open(file);
                        string line;
                        while ((line = reader.ReadLine()) != null)
                        {
                            writer.WriteLine(line);
                        }
                    }
                }
                finally
                {
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
コード例 #18
0
ファイル: Pipeline.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Writes the dictionary-type map to the file configured as
        /// <c>DefaultParameter.Field.dic_type_value_file</c>, one "key TAB value" line per entry.
        /// </summary>
        /// <remarks>
        /// When <c>GlobalParameter.featureNum</c> is non-zero, the written value is
        /// <c>featureNum - DataCenter.GetDicTypeNum() + dic[key]</c>; otherwise it is the
        /// raw map value.
        /// NOTE(review): the file is opened with <c>FileMode.OpenOrCreate</c>, which does not
        /// truncate an existing file - confirm that this is intended.
        /// </remarks>
        private void OutputDicTypeValue()
        {
            var dic    = DataCenter.GetDicTyeMap();
            var writer = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.dic_type_value_file), FileMode.OpenOrCreate);

            try
            {
                foreach (var key in dic.Keys)
                {
                    if (GlobalParameter.featureNum != 0)
                    {
                        writer.WriteLine(key + "\t" + (GlobalParameter.featureNum - DataCenter.GetDicTypeNum() + dic[key]));
                    }
                    else
                    {
                        writer.WriteLine(key + "\t" + dic[key]);
                    }
                }
            }
            finally
            {
                // Release the file handle even when a write fails.
                writer.Close();
            }
        }
コード例 #19
0
        /// <summary>
        /// Refine disambiguations file download from dbpedia
        /// </summary>
        /// <param name="sourceFile">Path of the disambiguation file to read.</param>
        /// <param name="desFile">Path of the refined file to create.</param>
        /// <remarks>
        /// The first line of the source is skipped. From every other line the source item
        /// (first "/name&gt;" segment, with any "(...)" qualifier removed) and the target item
        /// (the "/name&gt; ." segment at the end of the line) are extracted. Targets are
        /// grouped per source item and written as "source TAB target1 TAB target2 ...".
        /// Lines on which either pattern does not match are skipped; previously they
        /// produced rows with empty names. The output file is created only after the source
        /// has been read completely.
        /// </remarks>
        public static void RefineAmbiguousItem(string sourceFile, string desFile)
        {
            var sourceRegex      = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>");
            var deleteBraceRegex = new System.Text.RegularExpressions.Regex(@"_?\([^\)]+\)");
            var desRegex         = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>\s\.$");
            var dic              = new Dictionary<string, List<string>>(300000);

            var reader = new LargeFileReader(sourceFile);
            try
            {
                // Skip the first line of the file.
                reader.ReadLine();

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    var sourceMatch = sourceRegex.Match(line);
                    var desMatch    = desRegex.Match(line);
                    if (!sourceMatch.Success || !desMatch.Success)
                    {
                        continue;
                    }

                    var source = deleteBraceRegex.Replace(sourceMatch.Groups[1].Value, "");
                    List<string> list;
                    if (!dic.TryGetValue(source, out list))
                    {
                        list = new List<string>();
                        dic[source] = list;
                    }
                    list.Add(desMatch.Groups[1].Value);
                }
            }
            finally
            {
                reader.Close();
            }

            var writer = new LargeFileWriter(desFile, System.IO.FileMode.Create);
            try
            {
                foreach (var item in dic)
                {
                    writer.Write(item.Key);
                    foreach (var des in item.Value)
                    {
                        writer.Write("\t" + des);
                    }
                    writer.WriteLine("");
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #20
0
        /// <summary>
        /// Trains one weight per label and writes the result to a fixed weight file as
        /// "label TAB weight" lines.
        /// </summary>
        /// <remarks>
        /// The model and the development features are loaded from fixed paths. All weights
        /// start at 1. Each iteration subtracts <c>GetDiff()[label] * 0.005</c> from every
        /// weight and prints <c>positive</c> and the current weights. Iteration stops as soon
        /// as <c>positive</c> no longer increases, and the weights from before that last
        /// update are the ones written out.
        /// NOTE(review): <c>positive</c> is presumably updated inside <c>GetDiff()</c>; if it
        /// stays at -1 the loop never terminates - verify against that method.
        /// </remarks>
        public void Train()
        {
            this.statisticModel  = BayesTest.LoadModel(@"D:\Codes\C#\EntityTyping\Fine-ner\unit test\output\model.txt");
            this.developFeatures = LoadBayesData(@"D:\Codes\C#\EntityTyping\Fine-ner\unit test\output\developFeature.txt");
            this.labels          = GetLabels().ToList();
            this.w = new Dictionary<string, double>(this.labels.Count);
            foreach (var label in labels)  // Initial weight vector
            {
                w[label] = 1;
            }

            var learnSpeed   = 0.005;
            var lastPositive = -1;
            this.positive = -1;
            var lastWeight = new Dictionary<string, double>(w);

            while (lastPositive == -1 || ((this.positive - lastPositive) > 0))
            {
                lastPositive = this.positive;
                var diff = GetDiff();
                // Snapshot the weights before updating, so the last improving set survives.
                lastWeight = new Dictionary<string, double>(w);
                foreach (var label in labels)
                {
                    w[label] -= diff[label] * learnSpeed;
                }
                Console.WriteLine("positive: " + positive);
                foreach (var label in labels)
                {
                    Console.Write(label + "\t" + w[label] + "\t");
                }
                Console.WriteLine("");
            }

            FileWriter writer = new LargeFileWriter(@"D:\Codes\C#\EntityTyping\Fine-ner\unit test\output\weight.txt", FileMode.Create);
            try
            {
                foreach (var label in labels)
                {
                    writer.WriteLine(label + "\t" + lastWeight[label]);
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #21
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Reports, per label of the CoNLL training events, how many events have a
        /// <c>dbpediaTypes</c> feature other than "UNKNOW".
        /// </summary>
        /// <remarks>
        /// Output lines are "label TAB covered TAB total TAB covered/total". A label without
        /// any covered event is reported with a count of 0; previously it raised
        /// <see cref="KeyNotFoundException"/> while writing the report. Counting uses
        /// <c>TryGetValue</c> instead of catching exceptions for unseen labels. The report
        /// file is created only after the events have been read.
        /// </remarks>
        public static void Temp6()
        {
            var sourceFile     = @"D:\Codes\Project\EntityTyping\Fine-ner\output\conll feature\raw\train.txt";
            var desFile        = @"D:\Codes\Project\EntityTyping\Fine-ner\output\conll feature\raw\train data in dbpedia info.txt";
            var coverNumByType = new Dictionary<string, int>();
            var totals         = new Dictionary<string, int>();

            var reader = new EventReaderByLine(sourceFile);
            try
            {
                while (reader.HasNext())
                {
                    var event1     = reader.GetNextEvent();
                    var rawFeature = event1.Feature.ToList();
                    var label      = event1.Label.ToString();

                    int seen;
                    totals.TryGetValue(label, out seen);
                    totals[label] = seen + 1;

                    if (!rawFeature[(int)Event.Field.dbpediaTypes].Equals("UNKNOW"))
                    {
                        int covered;
                        coverNumByType.TryGetValue(label, out covered);
                        coverNumByType[label] = covered + 1;
                    }
                }
            }
            finally
            {
                reader.Close();
            }

            var writer = new LargeFileWriter(desFile, FileMode.Create);
            try
            {
                foreach (var entry in totals)
                {
                    int covered;
                    // Stays 0 for labels that never had a known DBpedia type.
                    coverNumByType.TryGetValue(entry.Key, out covered);
                    writer.WriteLine(entry.Key + "\t" + covered + "\t" + entry.Value + "\t" + (1.0 * covered / entry.Value));
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #22
0
ファイル: Word2Vec.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Copies the vectors of the words of interest from a binary word2vec file into a
        /// smaller text file.
        /// </summary>
        /// <param name="interestWordFile">Text file with one word of interest per line (trimmed on load).</param>
        /// <param name="word2vecFile">Path of the binary vector file to scan.</param>
        /// <param name="compressedWord2VectorFile">Path of the text file to create.</param>
        /// <remarks>
        /// Each selected vector is written as the word followed by its space-separated
        /// components and a terminating "\r". A record is assembled in memory and written in
        /// one call, so a record that fails part-way leaves no partial line. Failed records
        /// are reported on standard error and skipped. The number of scanned vectors is
        /// printed every 1000 vectors.
        /// </remarks>
        public static void SelectInterestWordVector(string interestWordFile, string word2vecFile, string compressedWord2VectorFile)
        {
            var set    = new HashSet<string>();
            var reader = new LargeFileReader(interestWordFile);
            try
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    set.Add(line.Trim());
                }
            }
            finally
            {
                reader.Close();
            }

            var writer = new LargeFileWriter(compressedWord2VectorFile, FileMode.Create);
            try
            {
                var parser = new ParseBinaryVector(word2vecFile);
                int count  = 0;

                while (!parser.EOF)
                {
                    if (++count % 1000 == 0)
                    {
                        Console.WriteLine(count);
                    }
                    try
                    {
                        var pair = parser.GetNextVector();
                        if (!set.Contains(pair.first))
                        {
                            continue;
                        }
                        var record = new System.Text.StringBuilder(pair.first);
                        foreach (var value in pair.second)
                        {
                            record.Append(string.Format(" {0}", value));
                        }
                        record.Append("\r");
                        writer.Write(record.ToString());
                    }
                    catch (Exception e)
                    {
                        // Best effort: keep going, but make the skipped record visible.
                        Console.Error.WriteLine("Skipped vector #" + count + ": " + e.Message);
                    }
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #23
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        /// Extracts "entity TAB type" pairs from the DBpedia mapping-based types dump.
        /// </summary>
        /// <remarks>
        /// A pair is written only for lines on which both the entity pattern and the ontology
        /// type pattern match. The number of written pairs is printed every 10000 pairs.
        /// Each pattern is evaluated once per line instead of twice.
        /// </remarks>
        public static void Temp()
        {
            var source = @"D:\Data\DBpedia\mapping based types";
            var des    = @"D:\Data\DBpedia\entity type pairs.txt";

            var entityRegex = new System.Text.RegularExpressions.Regex(@"/([^>/]+)>\s<");
            var typeRegex   = new System.Text.RegularExpressions.Regex(@"ontology/(\w+)>\s\.$");

            var reader = new pml.file.reader.LargeFileReader(source);
            try
            {
                var writer = new LargeFileWriter(des, FileMode.Create);
                try
                {
                    string line;
                    int count = 0;
                    while ((line = reader.ReadLine()) != null)
                    {
                        var entityMatch = entityRegex.Match(line);
                        if (!entityMatch.Success)
                        {
                            continue;
                        }
                        var typeMatch = typeRegex.Match(line);
                        if (!typeMatch.Success)
                        {
                            continue;
                        }
                        if (++count % 10000 == 0)
                        {
                            Console.WriteLine(count);
                        }
                        writer.WriteLine(entityMatch.Groups[1].Value + "\t" + typeMatch.Groups[1].Value);
                    }
                }
                finally
                {
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
コード例 #24
0
        /// <summary>
        ///        Store the trained model into file
        /// </summary>
        /// <param name="desPath">Path of the model file to create.</param>
        /// <param name="model">
        ///     The model, which must be a
        ///     <c>Dictionary&lt;string, Dictionary&lt;string, Dictionary&lt;string, int&gt;&gt;&gt;</c>
        ///     mapping label -> field -> feature value -> times.
        /// </param>
        /// <exception cref="InvalidCastException">
        ///     <paramref name="model"/> is not of the expected dictionary type.
        /// </exception>
        /// <format>
        ///     For every (label, field) pair:
        ///     [Label]
        ///     [field name]
        ///     TAB [feature annotation,e.g., last word]    TAB     [times]
        ///     TAB [feature annotation,e.g., last word]    TAB     [times]
        ///     Feature lines follow the order returned by Feature.SortKeysByNum.
        ///     NOTE(review): earlier documentation listed a closing "###END###" line, but this
        ///     method does not write one - confirm against the model loader.
        /// </format>
        internal static void OutputModel(string desPath, object model)
        {
            var writer = new LargeFileWriter(desPath, FileMode.Create);
            try
            {
                var dics = (Dictionary<string, Dictionary<string, Dictionary<string, int>>>)model;

                foreach (var labelEntry in dics)
                {
                    // labelEntry.Value: field -> (feature value -> times)
                    foreach (var fieldEntry in labelEntry.Value)
                    {
                        writer.WriteLine(labelEntry.Key);
                        writer.WriteLine(fieldEntry.Key);
                        var featureDic = fieldEntry.Value;
                        foreach (var featureValue in Feature.SortKeysByNum(featureDic))
                        {
                            writer.WriteLine("\t" + featureValue + "\t" + featureDic[featureValue]);
                        }
                    }
                }
            }
            finally
            {
                // Also covers a failed cast: the file created above is closed again.
                writer.Close();
            }
        }
コード例 #25
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        ///     Counts the lines of every file in the hard-coded source directory and appends the
        ///     counts to the hard-coded "data info.txt" file.
        /// </summary>
        /// <remarks>
        ///     The appended section starts with the last segment of the directory path followed
        ///     by ":", then holds one "[file name without extension] TAB : TAB [line count]" row
        ///     per file.
        /// </remarks>
        public static void GetItemNumByType()
        {
            var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\test";
            var desFile   = @"D:\Codes\Project\EntityTyping\Fine-ner\input\feature\data info.txt";
            var filePaths = Directory.GetFiles(sourceDir);
            var writer    = new LargeFileWriter(desFile, FileMode.Append);

            // Section header: everything after the last backslash of the directory path.
            var sectionName = sourceDir.Substring(sourceDir.LastIndexOf("\\") + 1);
            writer.WriteLine(sectionName + ":");

            foreach (var filePath in filePaths)
            {
                var reader  = new LargeFileReader(filePath);
                int lineNum = 0;

                // Only the number of lines matters, so the content is discarded.
                while (reader.ReadLine() != null)
                {
                    lineNum++;
                }
                reader.Close();
                writer.WriteLine(Path.GetFileNameWithoutExtension(filePath) + "\t:\t" + lineNum);
            }
            writer.Close();
        }
コード例 #26
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        ///     Extracts pairs from the hard-coded DBpedia "redirects.ttl" file and writes them,
        ///     tab separated, to "redirects.txt".
        /// </summary>
        /// <remarks>
        ///     A line contributes a row only when both patterns match it: the first captures the
        ///     text between a '/' and a following "&gt; &lt;", the second captures the word
        ///     characters between the final '/' and the closing "&gt; ." of the line. A running
        ///     total is printed to the console after every 10000 rows written.
        /// </remarks>
        public static void Temp3()
        {
            var sourcePath = @"D:\Data\DBpedia\redirects.ttl";
            var reader     = new pml.file.reader.LargeFileReader(sourcePath);
            var desPath    = @"D:\Data\DBpedia\redirects.txt";
            var writer     = new LargeFileWriter(desPath, FileMode.Create);

            var firstPattern  = new System.Text.RegularExpressions.Regex(@"/([^>/]+)>\s<");
            var secondPattern = new System.Text.RegularExpressions.Regex(@"/(\w+)>\s\.$");
            int written       = 0;

            for (string line = reader.ReadLine(); line != null; line = reader.ReadLine())
            {
                var firstMatch  = firstPattern.Match(line);
                var secondMatch = secondPattern.Match(line);

                // Lines that do not carry both parts are ignored.
                if (!firstMatch.Success || !secondMatch.Success)
                {
                    continue;
                }

                written++;
                if (written % 10000 == 0)
                {
                    Console.WriteLine(written);
                }
                writer.WriteLine(firstMatch.Groups[1].Value + "\t" + secondMatch.Groups[1].Value);
            }
            reader.Close();
            writer.Close();
        }
コード例 #27
0
        /// <summary>
        ///     Scratch routine made of nine independent file-processing sections, each guarded by a
        ///     constant condition. Only the last section (<c>if (true)</c>) runs; the eight
        ///     <c>if (false)</c> sections are kept in the source but never execute.
        ///     All input and output paths are hard coded.
        /// </summary>
        public static void Temp()
        {
            // [disabled] Copy train.txt to "limited train.txt", capping the lines written per type
            // (type = second tab-separated field of a line).
            if (false)
            {
                var reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\train.txt");
                var writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt", FileMode.OpenOrCreate);
                Dictionary <string, int> numByType = new Dictionary <string, int>(16);
                String   line;
                String[] array;
                int      count = 0;
                int      num   = 0;
                while ((line = reader.ReadLine()) != null)
                {
                    count++;
                    // progress report on stderr every 1000 input lines
                    if (count % 1000 == 0)
                    {
                        Console.Error.WriteLine(count + " items processed!");
                    }
                    array = line.Split('\t');
                    // Number of lines already written for this type; any exception raised by the
                    // lookup (e.g. type not seen yet) is treated as "none written so far".
                    try
                    {
                        num = numByType[array[1]];
                    }
                    catch (Exception)
                    {
                        num = 0;
                    }
                    // Skip the line once more than 100000 lines of its type have been written.
                    // NOTE(review): the comment originally attached to this check read "do not limit
                    // train data number by type", which contradicts what the code does - confirm intent.
                    if (num > 100000)
                    {
                        continue;
                    }
                    writer.WriteLine(line);
                    numByType[array[1]] = ++num;
                }
                reader.Close();
                writer.Close();
            }

            // [disabled] Rebuild an ".inst.txt" result file: keep the first line of the result file,
            // then replace the first tab-separated field of every following result line with the
            // first field of the corresponding line of develop.txt.
            if (false)
            {
                string     result  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\-1.inst.txt";
                string     source  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt";
                String     tmpFile = "./tmp.txt";
                FileReader reader1 = new LargeFileReader(result);
                FileReader reader2 = new LargeFileReader(source);
                FileWriter writer  = new LargeFileWriter(tmpFile, FileMode.OpenOrCreate);
                String     line;
                String     line2;


                // the first line of the result file is copied unchanged
                writer.WriteLine(reader1.ReadLine());

                while ((line = reader1.ReadLine()) != null)
                {
                    // The two files are consumed in lock step; line2 is not checked for null, so the
                    // source file is assumed to have at least as many lines as remain in the result file.
                    line2 = reader2.ReadLine();
                    // first field of the source line + everything after the first tab of the result line
                    writer.WriteLine(line2.Split('\t')[0] + "\t" + line.Split(new char[] { '\t' }, 2)[1]);
                }
                reader1.Close();
                reader2.Close();
                writer.Close();
                // publish the temporary file under its final name, then remove it
                File.Copy(tmpFile, @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\.inst.txt");
                File.Delete(tmpFile);
            }
            // [disabled] Rewrite the word table as "[word] TAB [index]" rows. The loop that should
            // fill wordSet is entirely commented out, so wordSet stays empty and no row is written.
            if (false)
            {
                string           wordTableFile = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\word table\wordTable.txt";
                FileReader       reader        = new LargeFileReader(wordTableFile);
                FileWriter       writer        = new LargeFileWriter();
                HashSet <string> wordSet       = new HashSet <string>();
                string           line;

                while ((line = reader.ReadLine()) != null)
                {
                    //var stemmer = StemmerPool.GetStemmer();
                    //wordSet.Add(stemmer.Stem(line.Split('\t')[0])[0]);
                    //StemmerPool.ReturnStemmer(stemmer);
                    //stemmer = null;
                }
                reader.Close();
                // the output goes to the same path that was just read
                writer.Open(wordTableFile);
                int i = 0;
                foreach (String word in wordSet)
                {
                    writer.WriteLine(word + '\t' + (i++));
                }
                writer.Close();
            }
            // [disabled] Merge every *.txt file of the names directory into name-all.txt, keeping
            // only the text before the first comma of each line.
            if (false)
            {
                String     dir    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\names";
                string     des    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
                FileReader reader = new LargeFileReader();
                FileWriter writer = new LargeFileWriter(des, FileMode.Create);
                string[]   files  = Directory.GetFiles(dir, "*.txt");
                string     line;

                foreach (String file in files)
                {
                    // The same reader is re-opened for each file and closed once after the loop.
                    // NOTE(review): presumably Open releases the previously opened file - confirm.
                    reader.Open(file);
                    while ((line = reader.ReadLine()) != null)
                    {
                        writer.WriteLine(line.Split(',')[0]);
                    }
                }
                reader.Close();
                writer.Close();
            }
            // [disabled] Filter name-all.txt: drop every name that occurs (lower-cased first field)
            // in "limited train.txt" on a line whose second field is not "people.person", and write
            // the remaining names to tmp.txt.
            if (false)
            {
                string           path1  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
                string           path2  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt";
                string           des    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\tmp.txt";
                FileReader       reader = new LargeFileReader(path1);
                FileWriter       writer = new LargeFileWriter(des);
                String           line;
                HashSet <String> set = new HashSet <string>();
                String[]         array;

                // load all names
                while ((line = reader.ReadLine()) != null)
                {
                    set.Add(line);
                    // the result of this split is never read
                    array = line.Split(' ');
                }
                reader.Close();
                reader.Open(path2);

                while ((line = reader.ReadLine()) != null)
                {
                    array = line.Split('\t');

                    // The training mention is lower-cased before the lookup; the names in the set
                    // are stored exactly as read from name-all.txt.
                    if (set.Contains(array[0].ToLower()))
                    {
                        if (!array[1].Equals("people.person"))
                        {
                            set.Remove(array[0].ToLower());
                        }
                    }
                }
                reader.Close();
                foreach (String name in set)
                {
                    writer.WriteLine(name);
                }
                writer.Close();
            }
            // [disabled] Copy the backed-up develop.txt, keeping only lines whose second field is
            // one of the three types listed below.
            if (false)
            {
                FileReader       reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\backup\version 1-2\develop.txt");
                FileWriter       writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt", FileMode.OpenOrCreate);
                String           line;
                string[]         array;
                HashSet <string> interestTypes = new HashSet <string>();
                interestTypes.Add("people.person");
                interestTypes.Add("location.location");
                interestTypes.Add("organization.organization");
                while ((line = reader.ReadLine()) != null)
                {
                    array = line.Split('\t');
                    if (interestTypes.Contains(array[1]))
                    {
                        writer.WriteLine(line);
                    }
                }
                reader.Close();
                writer.Close();
            }
            // [disabled] Re-order the "name:number" fields of every line of tmp.txt and write the
            // result to tmp2.txt. The first field of each line is skipped and not written back.
            if (false)
            {
                FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
                FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
                String     line;
                string[]   array;
                string[]   pairString;
                List <Pair <string, int> >     list     = new List <Pair <string, int> >();
                Pair <string, int>             pair     = new Pair <string, int>();
                // NOTE(review): judging by its name the comparer orders pairs by descending
                // `second`; Pair is declared elsewhere - confirm.
                Comparer <Pair <string, int> > comparer = pair.GetBySecondReverseComparer();

                while ((line = reader.ReadLine()) != null)
                {
                    array = line.Split('\t');
                    for (int i = 1; i < array.Length; i++)
                    {
                        // split at the LAST ':' so the name part may itself contain ':'
                        pairString  = new string[] { array[i].Substring(0, array[i].LastIndexOf(":")), array[i].Substring(array[i].LastIndexOf(":") + 1) };
                        pair        = new Pair <string, int>();
                        pair.first  = pairString[0];
                        pair.second = int.Parse(pairString[1]);
                        list.Add(pair);
                    }
                    list.Sort(comparer);
                    foreach (Pair <string, int> item in list)
                    {
                        writer.Write("\t" + item.first + ":" + item.second);
                    }
                    // each output line is terminated with a bare carriage return
                    writer.Write("\r");
                    list.Clear();
                }
                reader.Close();
                writer.Close();
            }
            // [ACTIVE] Combine the first three lines of tmp.txt. Every line holds "name:number"
            // fields after its first field. The names of line 1 are sorted with the comparer and
            // written to tmp2.txt as "name:(n1|n2|n3)", where n2 / n3 are the numbers the same name
            // has on lines 2 / 3 (0 when the lookup fails).
            if (true)
            {
                FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
                FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
                String     line;
                // `lines` is allocated but never used
                string[]   lines = new string[3];
                string[]   array;
                string[]   pairString;
                Dictionary <string, int>[]     dics     = new Dictionary <string, int> [3];
                List <Pair <string, int> >     list     = new List <Pair <string, int> >();
                Pair <string, int>             pair     = new Pair <string, int>();
                // NOTE(review): judging by its name the comparer orders pairs by descending
                // `second`; Pair is declared elsewhere - confirm.
                Comparer <Pair <string, int> > comparer = pair.GetBySecondReverseComparer();

                // Parse the three lines into name --> number dictionaries. `line` is not checked
                // for null, so the input is assumed to contain at least three lines.
                for (int i = 0; i < 3; i++)
                {
                    line    = reader.ReadLine();
                    array   = line.Split('\t');
                    dics[i] = new Dictionary <string, int>();
                    if (i == 0)
                    {
                        // The first line additionally fills `list`, which drives the output order.
                        for (int j = 1; j < array.Length; j++)
                        {
                            // split at the LAST ':' so the name part may itself contain ':'
                            pairString          = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                            pair                = new Pair <string, int>();
                            pair.first          = pairString[0];
                            pair.second         = int.Parse(pairString[1]);
                            dics[i][pair.first] = pair.second;
                            list.Add(pair);
                        }
                    }
                    else
                    {
                        for (int j = 1; j < array.Length; j++)
                        {
                            pairString             = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                            dics[i][pairString[0]] = int.Parse(pairString[1]);
                        }
                    }
                }
                list.Sort(comparer);
                // Item counter for the "\r" breaks below. It starts at 10, a multiple of 5, so the
                // breaks fall after the 5th, 10th, ... item exactly as if it had started at 0.
                int count = 10;
                // NOTE(review): the names suggest lines 2 and 3 hold location and organization
                // counts - confirm against the producer of tmp.txt.
                int locNum;
                int orgNum;
                foreach (Pair <string, int> item in list)
                {
                    count++;
                    // number of the same name on the second line; 0 if the lookup throws
                    try
                    {
                        locNum = dics[1][item.first];
                    }
                    catch (Exception)
                    {
                        locNum = 0;
                    }
                    // number of the same name on the third line; 0 if the lookup throws
                    try
                    {
                        orgNum = dics[2][item.first];
                    }catch (Exception)
                    {
                        orgNum = 0;
                    }
                    writer.Write("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
                    // a bare carriage return after every fifth item
                    if (count % 5 == 0)
                    {
                        writer.Write("\r");
                    }
                }
                reader.Close();
                writer.Close();
            }
        }
コード例 #28
0
ファイル: Test.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        ///     Maps DBpedia types to Satori types by counting how often they co-occur.
        /// </summary>
        /// <remarks>
        ///     <para>
        ///     Step 1 reads the hard-coded Satori training file. Fields 0 and 1 of every
        ///     tab-separated line are each mapped to field 2; judging by the dictionary names
        ///     these are the mention, the entity and the Satori type. A later line overwrites an
        ///     earlier one with the same key.
        ///     </para>
        ///     <para>
        ///     Step 2 reads the hard-coded DBpedia "entity TAB type" file. Parenthesised words are
        ///     removed from the entity, runs of white space are collapsed to one blank and the
        ///     result is trimmed. The normalised entity is looked up among the Satori entities
        ///     first and among the Satori mentions second; on a hit the counter of the
        ///     (DBpedia type, Satori type) pair is incremented.
        ///     </para>
        ///     <para>
        ///     Step 3 writes one "[DBpedia type] TAB [Satori type] TAB [count]" row per pair.
        ///     The output file is created only after both inputs have been read, so a failure
        ///     while reading leaves an existing output file untouched.
        ///     </para>
        ///     A line with too few tab-separated fields raises <c>IndexOutOfRangeException</c>.
        /// </remarks>
        public static void Temp2()
        {
            var dbpedia = @"D:\Data\DBpedia\entity type pairs.txt";
            var satori  = @"D:\Codes\C#\EntityTyping\Fine-ner\input\feature\train.txt";
            var des     = @"D:\Codes\C#\EntityTyping\Fine-ner\input\db2satori.txt";
            var dbpediaToSatoriDic = new Dictionary<string, Dictionary<string, int>>();
            var satoriMentionDic   = new Dictionary<string, string>();
            var satoriEntityDic    = new Dictionary<string, string>();
            string line;

            // Step 1: load the Satori lookup tables.
            var satoriReader = new LargeFileReader(satori);
            try
            {
                while ((line = satoriReader.ReadLine()) != null)
                {
                    var array = line.Split('\t');
                    satoriMentionDic[array[0]] = array[2];
                    satoriEntityDic[array[1]]  = array[2];
                }
            }
            finally
            {
                satoriReader.Close();
            }

            // Step 2: count (DBpedia type, Satori type) co-occurrences.
            var regex       = new System.Text.RegularExpressions.Regex(@"\s+");
            var deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
            var count       = 0;
            var dbpediaReader = new LargeFileReader(dbpedia);
            try
            {
                while ((line = dbpediaReader.ReadLine()) != null)
                {
                    if (++count % 10000 == 0)
                    {
                        Console.WriteLine(count);
                    }
                    var array  = line.Split('\t');
                    var entity = deleteBrace.Replace(array[0], "");
                    entity = regex.Replace(entity, " ").Trim();

                    // An exact entity match wins over a mention match.
                    string satoriType;
                    if (!satoriEntityDic.TryGetValue(entity, out satoriType) &&
                        !satoriMentionDic.TryGetValue(entity, out satoriType))
                    {
                        continue;
                    }

                    Dictionary<string, int> dic;
                    if (!dbpediaToSatoriDic.TryGetValue(array[1], out dic))
                    {
                        dic = new Dictionary<string, int>();
                        dbpediaToSatoriDic[array[1]] = dic;
                    }

                    // TryGetValue leaves times at 0 for a pair that has not been seen before.
                    int times;
                    dic.TryGetValue(satoriType, out times);
                    dic[satoriType] = times + 1;
                }
            }
            finally
            {
                dbpediaReader.Close();
            }

            // Step 3: dump the counters.
            var writer = new LargeFileWriter(des, FileMode.Create);
            try
            {
                foreach (var item in dbpediaToSatoriDic)
                {
                    foreach (var d in item.Value)
                    {
                        writer.WriteLine(item.Key + "\t" + d.Key + "\t" + d.Value);
                    }
                }
            }
            finally
            {
                writer.Close();
            }
        }
コード例 #29
0
ファイル: DataSpliter.cs プロジェクト: v-mipeng/EntityTyping
        /// <summary>
        ///     Splits mention data into train, develop and test files and writes per-type
        ///     statistics to <c>statisticInfoFile</c>.
        /// </summary>
        /// <remarks>
        ///     <para>
        ///     Input lines are tab separated; field 1 is compared with the previous line to count
        ///     consecutive lines of the same entity, field 2 is the type used for the limits and
        ///     field 0 is collected as the unique mention.
        ///     </para>
        ///     <para>
        ///     At most 10 consecutive lines per entity are considered. A considered line goes to
        ///     the train file while its type has fewer than 500000 train lines and fewer than
        ///     0.8 * (total of that type according to <c>LoadTotalNumByType</c>) / 10. Otherwise
        ///     it goes, chosen at random, to the develop or the test file, until 8000 such lines
        ///     have been written for the current file. Lines beyond that are dropped.
        ///     </para>
        ///     <para>
        ///     The file lists derived from <c>sourceDir</c>, <c>trainDir</c>, <c>developDir</c>
        ///     and <c>testDir</c> are built but then replaced by the hard-coded "time_event.txt"
        ///     paths, so only that file is processed. All train, develop and test files are
        ///     marked read-only at the end.
        ///     </para>
        /// </remarks>
        public void SplitData()
        {
            const int trainNumLimit            = 500000;
            const int limitMentionNumPerEntity = 10;
            const int devNumLimit              = 4000;

            var sourceDic           = LoadTotalNumByType();
            var mentionNumDic       = new Dictionary<string, int>();
            var uniqueMentionNumDic = new Dictionary<string, HashSet<string>>();
            var files               = Directory.GetFiles(this.sourceDir);
            var reader              = new LargeFileReader();
            var trainFiles          = new List<string>();
            var devFiles            = new List<string>();
            var testFiles           = new List<string>();

            // One train / develop / test path per source file, all sharing the source file name.
            foreach (var sourceFile in files)
            {
                var fileName = Path.GetFileName(sourceFile);
                trainFiles.Add(Path.Combine(trainDir, fileName));
                devFiles.Add(Path.Combine(developDir, fileName));
                testFiles.Add(Path.Combine(testDir, fileName));
            }

            var writers = new List<FileWriter>();
            // decides whether an overflow line goes to the develop or to the test file
            var random = new Random();

            // These two counters are deliberately shared by all files.
            int count       = 0;
            int numByEntity = 0;

            // The lists computed above are overridden: only time_event.txt is processed.
            files = new string[] { @"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\refined-satori\time_event.txt" };
            trainFiles.Clear();
            trainFiles.Add(@"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\train\time_event.txt");
            devFiles.Clear();
            devFiles.Add(@"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\develop\time_event.txt");
            testFiles.Clear();
            testFiles.Add(@"E:\Users\v-mipeng\Codes\Projects\EntityTyping\Fine-ner\input\satori\test\time_event.txt");

            for (int fileIndex = 0; fileIndex < files.Length; fileIndex++)
            {
                reader.Open(files[fileIndex]);
                var previousEntity = "";
                writers.Clear();
                writers.Add(new LargeFileWriter(devFiles[fileIndex], FileMode.Create));
                writers.Add(new LargeFileWriter(testFiles[fileIndex], FileMode.Create));
                var trainWriter  = new LargeFileWriter(trainFiles[fileIndex], FileMode.Create);
                int devOrTestNum = 0;

                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    count++;
                    if (count % 10000 == 0)
                    {
                        Console.WriteLine(count);
                    }

                    var fields = line.Split('\t');

                    // Count consecutive lines that belong to the same entity.
                    if (fields[1].Equals(previousEntity))
                    {
                        numByEntity++;
                    }
                    else
                    {
                        numByEntity    = 1;
                        previousEntity = fields[1];
                    }
                    if (numByEntity > limitMentionNumPerEntity)
                    {
                        continue;
                    }

                    var type = fields[2];
                    int trainNum;
                    mentionNumDic.TryGetValue(type, out trainNum);   // 0 for a type not seen yet

                    if (trainNum < trainNumLimit && trainNum < 0.8 * sourceDic[type] / limitMentionNumPerEntity)
                    {
                        SaveForTrain(trainWriter, line);
                        mentionNumDic[type] = trainNum + 1;

                        HashSet<string> mentions;
                        uniqueMentionNumDic.TryGetValue(type, out mentions);
                        if (mentions == null)
                        {
                            mentions = new HashSet<string>();
                            uniqueMentionNumDic[type] = mentions;
                        }
                        mentions.Add(fields[0]);
                    }
                    else if (devOrTestNum < devNumLimit * 2)
                    {
                        devOrTestNum++;
                        SaveForDevOrTest(writers[random.Next(0, 2)], line);
                    }
                }
                reader.Close();
                trainWriter.Close();
                writers[0].Close();
                writers[1].Close();
            }

            // Two rows per type: train line count, then number of distinct mentions.
            var statisticWriter = new LargeFileWriter(statisticInfoFile, FileMode.Create);
            foreach (var entry in mentionNumDic)
            {
                statisticWriter.WriteLine(entry.Key + "\t" + entry.Value);
                statisticWriter.WriteLine(entry.Key + "\t" + uniqueMentionNumDic[entry.Key].Count);
            }
            statisticWriter.Close();

            // Mark the generated data files read-only: train first, then develop, then test.
            foreach (var generatedFiles in new[] { trainFiles, devFiles, testFiles })
            {
                foreach (var generatedFile in generatedFiles)
                {
                    File.SetAttributes(generatedFile, FileAttributes.ReadOnly);
                }
            }
        }
コード例 #30
0
        /// <summary>
        ///     Builds a confusion matrix from a prediction result file and writes it, together
        ///     with macro precision / recall / F1 and micro precision, to an evaluation file.
        /// </summary>
        /// <param name="resultFile">
        ///     Tab-separated prediction file. Its first line is skipped; on every other line
        ///     field 1 is read as the true label and field 2 as the predicted label.
        /// </param>
        /// <param name="evaluationFile">
        ///     Path of the report to write; it is opened with <c>FileMode.Create</c> only after
        ///     <paramref name="resultFile"/> has been read completely.
        /// </param>
        /// <remarks>
        ///     Rows and columns of the written matrix are the true labels that occur in the
        ///     result file; a predicted label that never occurs as a true label gets no column.
        ///     A line with fewer than three fields raises <c>IndexOutOfRangeException</c>.
        /// </remarks>
        public void EvaluateResult(string resultFile, string evaluationFile)
        {
            const int trueLabelIndex    = 1;
            const int predictLabelIndex = 2;
            var result = new Dictionary<string, Dictionary<string, int>>(); // class-->(predicted class --> number)
            int times;
            string line;

            var reader = new LargeFileReader(resultFile);
            try
            {
                // The first line is skipped without being inspected.
                reader.ReadLine();

                while ((line = reader.ReadLine()) != null)
                {
                    var array        = line.Split('\t');
                    var trueLabel    = array[trueLabelIndex];
                    var predictLabel = array[predictLabelIndex];

                    Dictionary<string, int> dic;
                    if (!result.TryGetValue(trueLabel, out dic))
                    {
                        dic = new Dictionary<string, int>();
                        result[trueLabel] = dic;
                    }

                    // TryGetValue leaves times at 0 for a pair that has not been seen before.
                    dic.TryGetValue(predictLabel, out times);
                    dic[predictLabel] = times + 1;
                }
            }
            finally
            {
                reader.Close();
            }

            var writer = new LargeFileWriter(evaluationFile, FileMode.Create);
            try
            {
                var keys = result.Keys;

                // header row: one column per true label
                writer.Write("True|Predict");
                foreach (var key in keys)
                {
                    writer.Write("\t" + key);
                }
                writer.WriteLine("");

                // one row per true label; missing combinations are written as 0
                foreach (var key in keys)
                {
                    writer.Write(key);
                    var info = result[key];

                    foreach (var k in keys)
                    {
                        if (info.TryGetValue(k, out times))
                        {
                            writer.Write("\t" + times);
                        }
                        else
                        {
                            writer.Write("\t" + 0);
                        }
                    }
                    writer.WriteLine("");
                }

                var macroPre = Util.GetMacroPrecision(result);
                var macroRec = Util.GetMacroRecall(result);
                var macroF1  = Util.GetF1(macroPre, macroRec);

                writer.WriteLine("macro-precision: " + macroPre);
                writer.WriteLine("macro-recall   : " + macroRec);
                writer.WriteLine("macro-F1       : " + macroF1);
                var microPre = Util.GetMicroPrecision(result);

                writer.WriteLine("micro-precision: " + microPre);
            }
            finally
            {
                // Release the file handle even when a metric computation or a write fails.
                writer.Close();
            }
        }