Exemple #1
0
        /// <summary>
        ///    Converts a binary word2vec file into a text file. Each vector becomes one
        ///    record: the word (with everything up to and including the first "en/"
        ///    removed), followed by its components separated by single spaces and
        ///    terminated by "\r".
        /// </summary>
        /// <param name="word2vecFile">
        ///    Path of the binary word2vec file to parse.
        /// </param>
        /// <param name="parsedFile">
        ///    Path of the text file to create; it is opened with FileMode.Create.
        /// </param>
        static void  ParseWord2Vec(string word2vecFile, string parsedFile)
        {
            var writer = new LargeFileWriter(parsedFile, FileMode.Create);

            try
            {
                var parser = new ParseBinaryVector(word2vecFile);
                int count  = 0;

                while (!parser.EOF)
                {
                    // Progress report every 1000 records.
                    if (++count % 1000 == 0)
                    {
                        Console.WriteLine(count);
                    }
                    try
                    {
                        var pair  = parser.GetNextVector();
                        var index = pair.first.IndexOf("en/");
                        // Keep the whole key when the "en/" marker is absent; blindly
                        // calling Substring(index + 3) would cut off its first two characters.
                        var word = index < 0 ? pair.first : pair.first.Substring(index + 3);

                        // Assemble the record in memory so that a failure half-way
                        // through never leaves a partial line in the output file.
                        var record = new System.Text.StringBuilder(word);
                        foreach (var value in pair.second)
                        {
                            record.AppendFormat(" {0}", value);
                        }
                        record.Append("\r");
                        writer.Write(record.ToString());
                    }
                    catch (Exception)
                    {
                        // Best effort: a record that cannot be read or written is skipped.
                        // NOTE(review): this relies on the parser advancing (or reaching
                        // EOF) even when GetNextVector throws - confirm.
                        continue;
                    }
                }
            }
            finally
            {
                // Release the output file even if parsing fails unexpectedly.
                writer.Close();
            }
        }
Exemple #2
0
        /// <summary>
        ///    Writes one TF-IDF record per document to the file at <c>vectorPath</c>.
        ///    Each record is the 1-based document number followed by
        ///    "\t&lt;first&gt;:&lt;second&gt;" for every entry returned by GetTfIdf, and is
        ///    terminated by "\r". Documents are taken from <c>this.doc</c> until it is null.
        /// </summary>
        private void OutputTfIdf()
        {
            var writer = new LargeFileWriter(vectorPath, FileMode.Create);

            try
            {
                int docLabel = 1;
                int num      = 0;

                // Presumably loads the first document into this.doc - verify against ReadOneDoc.
                ReadOneDoc();

                while (this.doc != null)
                {
                    // Borrow a tokenizer only for the duration of the Tokenize call.
                    var tokenizer = TokenizerPool.GetTokenizer();
                    var document  = tokenizer.Tokenize(doc);
                    TokenizerPool.ReturnTokenizer(tokenizer);

                    // Progress report every 1000 documents.
                    if (++num % 1000 == 0)
                    {
                        Console.WriteLine(num);
                    }
                    var vector = GetTfIdf(document);
                    writer.Write(docLabel);
                    foreach (var value in vector)
                    {
                        writer.Write("\t" + value.first + ":" + value.second);
                    }
                    writer.Write("\r");
                    ReadOneDoc();
                    docLabel++;
                }
            }
            finally
            {
                // Close the output file even when tokenizing, weighting or writing fails.
                writer.Close();
            }
        }
        /// <summary>
        ///    Saves the cluster centroids to <c>centroidInfoFile</c>: one record per
        ///    centroid in <c>kmeans.Centroids</c>, every component followed by a tab,
        ///    each record terminated by "\r".
        /// </summary>
        private void SaveCentroids()
        {
            var writer = new LargeFileWriter(centroidInfoFile, FileMode.Create);

            try
            {
                foreach (var centroid in kmeans.Centroids)
                {
                    foreach (var value in centroid)
                    {
                        writer.Write(string.Format("{0}\t", value));
                    }
                    writer.Write("\r");
                }
            }
            finally
            {
                // Close the output file even if a write fails part-way through.
                writer.Close();
            }
        }
Exemple #4
0
        /// <summary>
        ///    Extract features for bayes model.
        ///    The input lines are split into chunks of 10000; each chunk is handed to a
        ///    BayesFeatureThread running on its own thread and writing to a temporary
        ///    file "./tmp&lt;i&gt;.txt". Once all threads have finished, the temporary files
        ///    are appended to the destination in chunk order and deleted.
        /// </summary>
        /// <param name="source">
        ///    File path storing the data from which this program extract features.
        /// </param>
        /// <param name="des">
        ///    File path to store the extracted features.
        /// </param>
        private static void ExtractBayesFeature(string source, string des)
        {
            FileReader reader = new LargeFileReader(source);

            try
            {
                FileWriter writer = new LargeFileWriter(des, FileMode.Create);

                try
                {
                    var       lines        = reader.ReadAllLines().ToList();
                    const int numPerThread = 10000;
                    var       threadNum    = (int)Math.Ceiling(1.0 * lines.Count / numPerThread);
                    var       childThreads = new Thread[threadNum];
                    var       tmpFiles     = new string[threadNum];

                    for (var i = 0; i < threadNum; i++)
                    {
                        tmpFiles[i] = "./tmp" + i + ".txt";
                        // The last chunk may hold fewer than numPerThread lines.
                        var start       = numPerThread * i;
                        var length      = Math.Min(numPerThread, lines.Count - start);
                        var threadClass = new BayesFeatureThread(lines.GetRange(start, length), tmpFiles[i]);
                        childThreads[i]      = new Thread(threadClass.ThreadMain);
                        childThreads[i].Name = "thread " + i;
                        childThreads[i].Start();
                    }
                    // Wait for every worker before touching its temporary file.
                    for (var i = 0; i < threadNum; i++)
                    {
                        childThreads[i].Join();
                    }
                    foreach (var tmpFile in tmpFiles)
                    {
                        var text = File.ReadAllText(tmpFile);
                        writer.Write(text);
                        File.Delete(tmpFile);
                    }
                }
                finally
                {
                    // The original never closed the writer, so buffered output could be
                    // lost and the destination file stayed open.
                    writer.Close();
                }
            }
            finally
            {
                // The original never closed the reader either.
                reader.Close();
            }
        }
Exemple #5
0
        /// <summary>
        /// Refine disambiguations file download from dbpedia.
        /// The first line of the source is skipped. For every other line, the first
        /// "/name&gt;" segment (with any "(...)" qualifier removed) is used as the key and
        /// the "/name&gt; ." segment at the end of the line as one of its targets. The
        /// output has one line per key: the key followed by its tab-separated targets.
        /// </summary>
        /// <param name="sourceFile">Path of the disambiguation file to read.</param>
        /// <param name="desFile">Path of the refined file to create (FileMode.Create).</param>
        public static void RefineAmbiguousItem(string sourceFile, string desFile)
        {
            var reader = new LargeFileReader(sourceFile);

            try
            {
                var writer = new LargeFileWriter(desFile, System.IO.FileMode.Create);

                try
                {
                    string line;

                    System.Text.RegularExpressions.Regex sourceRegex      = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>");
                    System.Text.RegularExpressions.Regex deleteBraceRegex = new System.Text.RegularExpressions.Regex(@"_?\([^\)]+\)");
                    System.Text.RegularExpressions.Regex desRegex         = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>\s\.$");

                    var           dic  = new Dictionary <string, List <string> >(300000);
                    List <string> list = null;

                    // Skip the first line of the file.
                    reader.ReadLine();

                    while ((line = reader.ReadLine()) != null)
                    {
                        var sourceMatch = sourceRegex.Match(line);
                        var desMatch    = desRegex.Match(line);

                        // Lines that do not have both parts (e.g. comment lines) used to be
                        // collected under an empty key with empty targets; ignore them.
                        if (!sourceMatch.Success || !desMatch.Success)
                        {
                            continue;
                        }
                        var source = deleteBraceRegex.Replace(sourceMatch.Groups[1].Value, "");

                        if (!dic.TryGetValue(source, out list))
                        {
                            list        = new List <string>();
                            dic[source] = list;
                        }
                        list.Add(desMatch.Groups[1].Value);
                    }
                    foreach (var item in dic)
                    {
                        writer.Write(item.Key);
                        foreach (var des in item.Value)
                        {
                            writer.Write("\t" + des);
                        }
                        writer.WriteLine("");
                    }
                }
                finally
                {
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
Exemple #6
0
        /// <summary>
        ///    Copies the vectors of selected words from a binary word2vec file into a
        ///    text file. Each kept vector becomes one record: the word, followed by its
        ///    components separated by single spaces and terminated by "\r".
        /// </summary>
        /// <param name="interestWordFile">
        ///    Text file listing the words to keep, one per line (lines are trimmed).
        /// </param>
        /// <param name="word2vecFile">
        ///    Path of the binary word2vec file to scan.
        /// </param>
        /// <param name="compressedWord2VectorFile">
        ///    Path of the text file to create; it is opened with FileMode.Create.
        /// </param>
        public static void SelectInterestWordVector(string interestWordFile, string word2vecFile, string compressedWord2VectorFile)
        {
            var set    = new HashSet <string>();
            var reader = new LargeFileReader(interestWordFile);

            try
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    set.Add(line.Trim());
                }
            }
            finally
            {
                reader.Close();
            }

            var writer = new LargeFileWriter(compressedWord2VectorFile, FileMode.Create);

            try
            {
                var parser = new ParseBinaryVector(word2vecFile);
                int count  = 0;

                while (!parser.EOF)
                {
                    // Progress report every 1000 records.
                    if (++count % 1000 == 0)
                    {
                        Console.WriteLine(count);
                    }
                    try
                    {
                        var pair = parser.GetNextVector();
                        if (!set.Contains(pair.first))
                        {
                            continue;
                        }
                        // Assemble the record in memory so that a failure half-way
                        // through never leaves a partial line in the output file.
                        var record = new System.Text.StringBuilder(pair.first);
                        foreach (var value in pair.second)
                        {
                            record.AppendFormat(" {0}", value);
                        }
                        record.Append("\r");
                        writer.Write(record.ToString());
                    }
                    catch (Exception)
                    {
                        // Best effort: a record that cannot be read or written is skipped.
                        // NOTE(review): this relies on the parser advancing (or reaching
                        // EOF) even when GetNextVector throws - confirm.
                        continue;
                    }
                }
            }
            finally
            {
                // Release the output file even if parsing fails unexpectedly.
                writer.Close();
            }
        }
Exemple #7
0
        /// <summary>
        ///    Scratch method holding several one-off data-preparation snippets, each
        ///    wrapped in a constant <c>if</c>. Only the final <c>if (true)</c> section
        ///    executes; every other section is disabled with <c>if (false)</c>.
        ///    All input and output paths are hard-coded.
        /// </summary>
        public static void Temp()
        {
            // Disabled: copy train.txt to "limited train.txt", capping how many lines
            // are written per type (the type is the second tab-separated field).
            if (false)
            {
                var reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\train.txt");
                var writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt", FileMode.OpenOrCreate);
                Dictionary <string, int> numByType = new Dictionary <string, int>(16);
                String   line;
                String[] array;
                int      count = 0;
                int      num   = 0;
                while ((line = reader.ReadLine()) != null)
                {
                    count++;
                    if (count % 1000 == 0)
                    {
                        Console.Error.WriteLine(count + " items processed!");
                    }
                    array = line.Split('\t');
                    // A failed lookup (type not seen yet) is treated as a count of zero.
                    try
                    {
                        num = numByType[array[1]];
                    }
                    catch (Exception)
                    {
                        num = 0;
                    }
                    if (num > 100000)   // skip the line once more than 100000 lines of this type have been written
                    {
                        continue;
                    }
                    writer.WriteLine(line);
                    numByType[array[1]] = ++num;
                }
                reader.Close();
                writer.Close();
            }

            // Disabled: rebuild ".inst.txt" by pairing each line of the result file
            // (after its first line, which is copied as-is) with the next line of
            // the develop file: the first field of the develop line replaces the
            // first field of the result line.
            if (false)
            {
                string     result  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\-1.inst.txt";
                string     source  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt";
                String     tmpFile = "./tmp.txt";
                FileReader reader1 = new LargeFileReader(result);
                FileReader reader2 = new LargeFileReader(source);
                FileWriter writer  = new LargeFileWriter(tmpFile, FileMode.OpenOrCreate);
                String     line;
                String     line2;


                writer.WriteLine(reader1.ReadLine());

                while ((line = reader1.ReadLine()) != null)
                {
                    line2 = reader2.ReadLine();
                    // Split(..., 2)[1] is everything after the first tab of the result line.
                    writer.WriteLine(line2.Split('\t')[0] + "\t" + line.Split(new char[] { '\t' }, 2)[1]);
                }
                reader1.Close();
                reader2.Close();
                writer.Close();
                File.Copy(tmpFile, @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\result\satori_lc\.inst.txt");
                File.Delete(tmpFile);
            }
            // Disabled: read the word table and write the collected words back to
            // the same path with a running index. The loop body is commented out,
            // so wordSet stays empty and nothing is written as the code stands.
            if (false)
            {
                string           wordTableFile = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\word table\wordTable.txt";
                FileReader       reader        = new LargeFileReader(wordTableFile);
                FileWriter       writer        = new LargeFileWriter();
                HashSet <string> wordSet       = new HashSet <string>();
                string           line;

                while ((line = reader.ReadLine()) != null)
                {
                    //var stemmer = StemmerPool.GetStemmer();
                    //wordSet.Add(stemmer.Stem(line.Split('\t')[0])[0]);
                    //StemmerPool.ReturnStemmer(stemmer);
                    //stemmer = null;
                }
                reader.Close();
                writer.Open(wordTableFile);
                int i = 0;
                foreach (String word in wordSet)
                {
                    writer.WriteLine(word + '\t' + (i++));
                }
                writer.Close();
            }
            // Disabled: write the first comma-separated field of every line of every
            // *.txt file in the names directory into name-all.txt.
            if (false)
            {
                String     dir    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\names";
                string     des    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
                FileReader reader = new LargeFileReader();
                FileWriter writer = new LargeFileWriter(des, FileMode.Create);
                string[]   files  = Directory.GetFiles(dir, "*.txt");
                string     line;

                foreach (String file in files)
                {
                    // NOTE(review): the reader is re-opened per file but closed only once
                    // after the loop - confirm Open releases the previously opened file.
                    reader.Open(file);
                    while ((line = reader.ReadLine()) != null)
                    {
                        writer.WriteLine(line.Split(',')[0]);
                    }
                }
                reader.Close();
                writer.Close();
            }
            // Disabled: load every line of name-all.txt into a set, remove the names
            // that occur in "limited train.txt" (lower-cased first field) with a type
            // other than "people.person", and write the remaining names to tmp.txt.
            if (false)
            {
                string           path1  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\name-all.txt";
                string           path2  = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\train\limited train.txt";
                string           des    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\dictionary\tmp.txt";
                FileReader       reader = new LargeFileReader(path1);
                FileWriter       writer = new LargeFileWriter(des);
                String           line;
                HashSet <String> set = new HashSet <string>();
                String[]         array;

                while ((line = reader.ReadLine()) != null)
                {
                    set.Add(line);
                    // The result of this split is never read.
                    array = line.Split(' ');
                }
                reader.Close();
                reader.Open(path2);

                while ((line = reader.ReadLine()) != null)
                {
                    array = line.Split('\t');

                    if (set.Contains(array[0].ToLower()))
                    {
                        if (!array[1].Equals("people.person"))
                        {
                            set.Remove(array[0].ToLower());
                        }
                    }
                }
                reader.Close();
                foreach (String name in set)
                {
                    writer.WriteLine(name);
                }
                writer.Close();
            }
            // Disabled: copy the lines of the backup develop.txt whose second
            // tab-separated field is one of the three types listed below.
            if (false)
            {
                FileReader       reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\backup\version 1-2\develop.txt");
                FileWriter       writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\input\satori_lc\develop.txt", FileMode.OpenOrCreate);
                String           line;
                string[]         array;
                HashSet <string> interestTypes = new HashSet <string>();
                interestTypes.Add("people.person");
                interestTypes.Add("location.location");
                interestTypes.Add("organization.organization");
                while ((line = reader.ReadLine()) != null)
                {
                    array = line.Split('\t');
                    if (interestTypes.Contains(array[1]))
                    {
                        writer.WriteLine(line);
                    }
                }
                reader.Close();
                writer.Close();
            }
            // Disabled: for every line of tmp.txt, parse the tab-separated
            // "key:number" fields (the first field is skipped), sort them with the
            // comparer from GetBySecondReverseComparer (presumably descending by the
            // number - confirm) and write them to tmp2.txt.
            if (false)
            {
                FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
                FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
                String     line;
                string[]   array;
                string[]   pairString;
                List <Pair <string, int> >     list     = new List <Pair <string, int> >();
                Pair <string, int>             pair     = new Pair <string, int>();
                Comparer <Pair <string, int> > comparer = pair.GetBySecondReverseComparer();

                while ((line = reader.ReadLine()) != null)
                {
                    array = line.Split('\t');
                    for (int i = 1; i < array.Length; i++)
                    {
                        // Split at the LAST ':' so that keys containing ':' stay intact.
                        pairString  = new string[] { array[i].Substring(0, array[i].LastIndexOf(":")), array[i].Substring(array[i].LastIndexOf(":") + 1) };
                        pair        = new Pair <string, int>();
                        pair.first  = pairString[0];
                        pair.second = int.Parse(pairString[1]);
                        list.Add(pair);
                    }
                    list.Sort(comparer);
                    foreach (Pair <string, int> item in list)
                    {
                        writer.Write("\t" + item.first + ":" + item.second);
                    }
                    writer.Write("\r");
                    list.Clear();
                }
                reader.Close();
                writer.Close();
            }
            // Active section: read the first three lines of tmp.txt as "key:number"
            // tables, sort the entries of the first line, and write each entry to
            // tmp2.txt together with the numbers the same key has in lines two and three.
            if (true)
            {
                FileReader reader = new LargeFileReader(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp.txt");
                FileWriter writer = new LargeFileWriter(@"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\output\tmp2.txt", FileMode.Create);
                String     line;
                string[]   lines = new string[3];
                string[]   array;
                string[]   pairString;
                Dictionary <string, int>[]     dics     = new Dictionary <string, int> [3];
                List <Pair <string, int> >     list     = new List <Pair <string, int> >();
                Pair <string, int>             pair     = new Pair <string, int>();
                Comparer <Pair <string, int> > comparer = pair.GetBySecondReverseComparer();

                // NOTE(review): ReadLine's result is not null-checked, so an input with
                // fewer than three lines would fail on line.Split below.
                for (int i = 0; i < 3; i++)
                {
                    line    = reader.ReadLine();
                    array   = line.Split('\t');
                    dics[i] = new Dictionary <string, int>();
                    if (i == 0)
                    {
                        // First line: fill both the lookup table and the list to be sorted.
                        for (int j = 1; j < array.Length; j++)
                        {
                            pairString          = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                            pair                = new Pair <string, int>();
                            pair.first          = pairString[0];
                            pair.second         = int.Parse(pairString[1]);
                            dics[i][pair.first] = pair.second;
                            list.Add(pair);
                        }
                    }
                    else
                    {
                        // Second and third line: lookup table only.
                        for (int j = 1; j < array.Length; j++)
                        {
                            pairString             = new string[] { array[j].Substring(0, array[j].LastIndexOf(":")), array[j].Substring(array[j].LastIndexOf(":") + 1) };
                            dics[i][pairString[0]] = int.Parse(pairString[1]);
                        }
                    }
                }
                list.Sort(comparer);
                // Starts at a multiple of 5, so a "\r" is written after every fifth entry.
                int count = 10;
                // Numbers of the current key in the second and third line; the names
                // suggest location / organization counts - confirm against the input file.
                int locNum;
                int orgNum;
                foreach (Pair <string, int> item in list)
                {
                    count++;
                    // A key missing from the second/third table counts as zero.
                    try
                    {
                        locNum = dics[1][item.first];
                    }
                    catch (Exception)
                    {
                        locNum = 0;
                    }
                    try
                    {
                        orgNum = dics[2][item.first];
                    }catch (Exception)
                    {
                        orgNum = 0;
                    }
                    writer.Write("\t" + item.first + ":(" + item.second + "|" + locNum + "|" + orgNum + ")");
                    if (count % 5 == 0)
                    {
                        writer.Write("\r");
                    }
                }
                reader.Close();
                writer.Close();
            }
        }
Exemple #8
0
        /// <summary>
        ///    Builds a confusion matrix from a tab-separated result file and writes it,
        ///    followed by macro precision / recall / F1 and micro precision, to the
        ///    evaluation file. The first line of the result file is skipped; in every
        ///    other line column 1 is the true label and column 2 the predicted label
        ///    (0-based). Empty lines are ignored.
        /// </summary>
        /// <param name="resultFile">Path of the tab-separated prediction file to read.</param>
        /// <param name="evaluationFile">Path of the report to create (FileMode.Create).</param>
        public void EvaluateResult(string resultFile, string evaluationFile)
        {
            const int trueLabelIndex    = 1;
            const int predictLabelIndex = 2;
            // true class --> (predicted class --> number)
            var result = new Dictionary <string, Dictionary <string, int> >();
            var reader = new LargeFileReader(resultFile);

            try
            {
                var writer = new LargeFileWriter(evaluationFile, FileMode.Create);

                try
                {
                    string line;
                    int    times;

                    // Skip the first line of the result file.
                    reader.ReadLine();

                    while ((line = reader.ReadLine()) != null)
                    {
                        // An empty line (e.g. at the end of the file) carries no labels.
                        if (line.Length == 0)
                        {
                            continue;
                        }
                        var array          = line.Split('\t');
                        var trueLabel      = array[trueLabelIndex];
                        var predictedLabel = array[predictLabelIndex];

                        Dictionary <string, int> dic;
                        if (!result.TryGetValue(trueLabel, out dic))
                        {
                            dic = new Dictionary <string, int>();
                            result[trueLabel] = dic;
                        }
                        // TryGetValue leaves times at 0 for a pair not seen before.
                        dic.TryGetValue(predictedLabel, out times);
                        dic[predictedLabel] = times + 1;
                    }

                    // Header row: one column per true label.
                    writer.Write("True|Predict");
                    var keys = result.Keys;

                    foreach (var key in keys)
                    {
                        writer.Write("\t" + key);
                    }
                    writer.WriteLine("");

                    // One row per true label. Only labels that occur as a true label
                    // get a column, matching the header row.
                    foreach (var key in keys)
                    {
                        writer.Write(key);
                        var info = result[key];

                        foreach (var k in keys)
                        {
                            // A missing pair yields times == 0, which is written as "0".
                            info.TryGetValue(k, out times);
                            writer.Write("\t" + times);
                        }
                        writer.WriteLine("");
                    }
                    var macroPre = Util.GetMacroPrecision(result);
                    var macroRec = Util.GetMacroRecall(result);
                    var macroF1  = Util.GetF1(macroPre, macroRec);

                    writer.WriteLine("macro-precision: " + macroPre);
                    writer.WriteLine("macro-recall   : " + macroRec);
                    writer.WriteLine("macro-F1       : " + macroF1);
                    var microPre = Util.GetMicroPrecision(result);

                    writer.WriteLine("micro-precision: " + microPre);
                }
                finally
                {
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
Exemple #9
0
        /// <summary>
        ///    Runs the model over every feature item of <c>sourceFile</c> and writes the
        ///    outcome to <c>resultFile</c>: one line per item with its index, actual
        ///    label and the entries of <c>this.scores</c>, followed by a table of
        ///    actual versus predicted labels with recall per row and precision per column.
        ///    Items for which Predict throws are logged with "NULL" as prediction and
        ///    left out of the statistics.
        /// </summary>
        public void Test()
        {
            if (model == null)
            {
                Initial();
            }
            var        fields = BayesModel.GetFields(sourceFile);
            FileReader reader = new LargeFileReader(sourceFile);

            try
            {
                FileWriter writer = new LargeFileWriter(resultFile, FileMode.Create);

                try
                {
                    // actual label --> (predicted label --> times)
                    var detailDic     = new Dictionary <string, Dictionary <string, int> >();
                    var positiveNums  = new Dictionary <string, int>(); // correct predictions by type
                    var predictedNums = new Dictionary <string, int>(); // predicted number by type
                    var actualNums    = new Dictionary <string, int>(); // actual number by type
                    Dictionary <string, int> dic = null;
                    Pair <string, Dictionary <string, object> > feature = null;
                    var i = 0;

                    while ((feature = BayesModel.GetFeatureItem(reader, fields)) != null)
                    {
                        i++;
                        var    label          = feature.first;
                        string predictedLabel = null;
                        try
                        {
                            predictedLabel = Predict(feature.second);
                        }
                        catch (Exception)
                        {
                            // Best effort: log the item and keep going.
                            Console.WriteLine("Wrong!");
                            writer.WriteLine(i + "\t" + label + "\tNULL");
                            continue;
                        }
                        writer.Write(string.Format("{0}\t{1, -30}", i, label));
                        // Presumably this.scores holds the scores of the Predict call above - confirm.
                        foreach (var score in this.scores)
                        {
                            writer.Write(string.Format("{0,30}:{1,-10:F2}", score.first, score.second));
                        }
                        writer.Write("\r");

                        if (label.Equals(predictedLabel))
                        {
                            IncrementCount(positiveNums, label);
                        }
                        IncrementCount(predictedNums, predictedLabel);
                        IncrementCount(actualNums, label);

                        // update detail dictionary
                        if (!detailDic.TryGetValue(label, out dic))
                        {
                            dic = new Dictionary <string, int>();
                            detailDic[label] = dic;
                        }
                        IncrementCount(dic, predictedLabel);
                    }

                    var buffer = new StringBuilder();

                    buffer.Append(string.Format("{0,-30}", "actual label |predicted type"));
                    foreach (var key in this.labels)
                    {
                        buffer.Append(string.Format("{0,-30}", key));
                    }
                    buffer.Append(string.Format("{0,-30}\r", "recall"));
                    foreach (var key in this.labels)
                    {
                        buffer.Append(string.Format("{0,-30}", key));
                        // A label without any actual instance has no row data; every
                        // cell is reported as 0 instead of throwing KeyNotFoundException.
                        if (!detailDic.TryGetValue(key, out dic))
                        {
                            dic = null;
                        }
                        foreach (var k in this.labels)
                        {
                            buffer.Append(string.Format("{0,-30}", GetCountOrZero(dic, k)));
                        }
                        // recall
                        buffer.Append(string.Format("{0,-30}\r", SafeRatio(GetCountOrZero(positiveNums, key), GetCountOrZero(actualNums, key))));
                    }
                    buffer.Append(string.Format("{0,-30}", "precision"));
                    foreach (var key in this.labels)
                    {
                        buffer.Append(string.Format("{0,-30:f5}", SafeRatio(GetCountOrZero(positiveNums, key), GetCountOrZero(predictedNums, key))));
                    }
                    buffer.Append("\r");
                    writer.WriteLine(buffer.ToString());
                }
                finally
                {
                    writer.Close();
                }
            }
            finally
            {
                // The original never closed the reader.
                reader.Close();
            }
        }

        // Adds one to the counter stored under key, starting from zero for a new key.
        private static void IncrementCount(Dictionary <string, int> counts, string key)
        {
            int current;
            counts.TryGetValue(key, out current);
            counts[key] = current + 1;
        }

        // Returns the counter stored under key, or 0 when the table or the key is missing.
        private static int GetCountOrZero(Dictionary <string, int> counts, string key)
        {
            int value;
            return counts != null && counts.TryGetValue(key, out value) ? value : 0;
        }

        // Returns numerator / denominator as a double, or 0 when the denominator is 0.
        private static double SafeRatio(int numerator, int denominator)
        {
            return denominator == 0 ? 0.0 : 1.0 * numerator / denominator;
        }