Exemple #1
0
        /// <summary>
        /// Load the hierarchy of interest types from <c>hierarchyFile</c> into <c>low2top</c>.
        /// </summary>
        /// <remarks>
        /// Every line holds tab-separated type names. Each name on a line (including the
        /// first one) is mapped to the first name of that line. Blank lines are ignored.
        /// </remarks>
        private void LoadHierarchy()
        {
            FileReader reader = new LargeFileReader(hierarchyFile);

            this.low2top = new Dictionary<string, string>();
            try
            {
                var    separators = new char[] { '\t' };
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    var array = line.Split(separators, StringSplitOptions.RemoveEmptyEntries);
                    if (array.Length == 0)
                    {
                        // RemoveEmptyEntries yields an empty array for a blank line.
                        continue;
                    }
                    foreach (var type in array)
                    {
                        low2top[type] = array[0];
                    }
                }
            }
            finally
            {
                // Release the file even when a line cannot be processed.
                reader.Close();
            }
        }
        /// <summary>
        /// Load word vectors from <c>vectorFile</c> into <c>words</c> and <c>vectors</c>.
        /// </summary>
        /// <remarks>
        /// Every line is trimmed and split on <c>seperator</c>; the first column is the word and
        /// the remaining columns are the vector components. Lines whose column count is not
        /// <paramref name="dimension"/> + 1 are skipped.
        /// </remarks>
        /// <param name="size">Number of vector slots to allocate; reading stops once all are filled.</param>
        /// <param name="dimension">Expected number of components of each vector.</param>
        private void LoadVectors(int size, int dimension)
        {
            FileReader reader = new LargeFileReader(vectorFile);

            this.vectors = new double[size][];
            this.words   = new List<string>();

            try
            {
                string line;
                int    index = 0;

                // Stop once every allocated slot is used instead of indexing past the array.
                while (index < size && (line = reader.ReadLine()) != null)
                {
                    var array = line.Trim().Split(this.seperator);
                    if (array.Length != dimension + 1)
                    {
                        continue;
                    }
                    var vector = new double[dimension];
                    for (int i = 1; i < array.Length; i++)
                    {
                        // Invariant culture so "0.5" is read the same way on every OS locale.
                        vector[i - 1] = double.Parse(array[i], System.Globalization.CultureInfo.InvariantCulture);
                    }
                    // Register the word only after its vector parsed, keeping both lists aligned.
                    words.Add(array[0]);
                    this.vectors[index] = vector;
                    index++;
                }
            }
            finally
            {
                reader.Close();
            }
        }
Exemple #3
0
        /// <summary>
        ///    Extract features for bayes model.
        /// </summary>
        /// <remarks>
        ///    The source lines are split into chunks of 10000 lines. Each chunk is handed to a
        ///    <c>BayesFeatureThread</c> running on its own thread together with a temporary file
        ///    path. After all threads have finished, the temporary files are appended to the
        ///    destination in chunk order and deleted.
        /// </remarks>
        /// <param name="source">
        ///    File path storing the data from which this program extract features.
        /// </param>
        /// <param name="des">
        ///    File path to store the extracted features.
        /// </param>
        private static void ExtractBayesFeature(string source, string des)
        {
            const int  numPerThread = 10000;
            FileReader reader       = new LargeFileReader(source);

            try
            {
                FileWriter writer = new LargeFileWriter(des, FileMode.Create);
                try
                {
                    var lines        = reader.ReadAllLines().ToList();
                    var threadNum    = (int)Math.Ceiling(1.0 * lines.Count / numPerThread);
                    var childThreads = new Thread[threadNum];
                    var tmpFiles     = new string[threadNum];

                    for (var i = 0; i < threadNum; i++)
                    {
                        var start = numPerThread * i;
                        tmpFiles[i] = "./tmp" + i + ".txt";
                        // The last chunk may be shorter than numPerThread.
                        var chunk       = lines.GetRange(start, Math.Min(numPerThread, lines.Count - start));
                        var threadClass = new BayesFeatureThread(chunk, tmpFiles[i]);
                        childThreads[i]      = new Thread(threadClass.ThreadMain);
                        childThreads[i].Name = "thread " + i;
                        childThreads[i].Start();
                    }
                    foreach (var thread in childThreads)
                    {
                        thread.Join();
                    }
                    foreach (var tmpFile in tmpFiles)
                    {
                        writer.Write(File.ReadAllText(tmpFile));
                        File.Delete(tmpFile);
                    }
                }
                finally
                {
                    // Without closing, buffered output may never reach the destination file.
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
Exemple #4
0
        /// <summary>
        /// Detect the field names used by a data file by sampling its first lines.
        /// </summary>
        /// <remarks>
        /// At most the first 100 lines are read. Every match of <c>fieldRegex</c> contributes its
        /// first capture group as a field name. A name is kept when its number of matches divided
        /// by the number of sampled lines is greater than 0.95.
        /// </remarks>
        /// <param name="sourceFile">Path of the file to sample.</param>
        /// <returns>The set of field names that passed the frequency threshold.</returns>
        internal static HashSet <string> GetFields(string sourceFile)
        {
            const int  sampleSize  = 100;
            FileReader reader      = new LargeFileReader(sourceFile);
            var        occurrences = new Dictionary<string, int>();
            var        sampled     = 0;

            try
            {
                string line;

                // Count only lines that are actually inspected so the ratio below
                // uses the true number of sampled lines as its denominator.
                while (sampled < sampleSize && (line = reader.ReadLine()) != null)
                {
                    sampled++;
                    foreach (Match match in fieldRegex.Matches(line))
                    {
                        var name = match.Groups[1].Value;
                        int seen;
                        occurrences.TryGetValue(name, out seen);   // seen stays 0 for a new name
                        occurrences[name] = seen + 1;
                    }
                }
            }
            finally
            {
                reader.Close();
            }

            var fields = new HashSet<string>();
            foreach (var entry in occurrences)
            {
                if ((1.0 * entry.Value / sampled) > 0.95)
                {
                    fields.Add(entry.Key);
                }
            }
            return fields;
        }
        /// <summary>
        /// Collect the first tab-separated column of the first 100 lines of every file in a
        /// hard-coded directory and write the distinct values to a hard-coded keyword file.
        /// </summary>
        static void Temp()
        {
            var sourceDir = @"D:\Codes\Project\EntityTyping\Fine-ner\input\tmp\";
            var des       = @"D:\Codes\Project\EntityTyping\Fine-ner\input\keywords.txt";
            var keyWords  = new HashSet<string>();
            var reader    = new LargeFileReader();

            foreach (var file in Directory.GetFiles(sourceDir))
            {
                reader.Open(file);
                try
                {
                    string line;
                    int    count = 0;

                    while (count < 100 && (line = reader.ReadLine()) != null)
                    {
                        count++;
                        keyWords.Add(line.Split('\t')[0]);
                    }
                }
                finally
                {
                    // Close each file before the next one is opened so no handle is left behind.
                    reader.Close();
                }
            }

            var writer = new LargeFileWriter(des, FileMode.Create);
            try
            {
                foreach (var word in keyWords)
                {
                    writer.WriteLine(word);
                }
            }
            finally
            {
                writer.Close();
            }
        }
Exemple #6
0
        /// <summary>
        ///     Read the train data file into a list of (label, field values) pairs.
        /// </summary>
        /// <remarks>
        ///     Expected line format:
        ///     [Label]   TAB   [FieldName]:{[value1],[value2]...}   TAB   [FieldName]:{[value1],[value2]...}  ...
        ///     The label is everything before the first tab. A field whose name is not returned by
        ///     <c>GetFields</c> for the same file is reported on the console and left out.
        /// </remarks>
        /// <param name="sourceFile">
        ///     File path of the train data.
        /// </param>
        /// <returns>
        ///     One pair per line: <c>first</c> is the class label, <c>second</c> maps each accepted
        ///     field name to its list of non-empty comma-separated values.
        /// </returns>
        internal static List <Pair <string, Dictionary <string, List <string> > > > LoadBayesData(string sourceFile)
        {
            FileReader input           = new LargeFileReader(sourceFile);
            var        labelPattern    = new Regex("^[^\t]*");
            var        fieldPattern    = new Regex(@"\t([^:]*):{([^}]*)}");
            var        valueSeparators = new char[] { ',' };
            var        result          = new List<Pair<string, Dictionary<string, List<string>>>>();
            var        knownFields     = GetFields(sourceFile);
            var        lineNumber      = 0;

            for (string row = input.ReadLine(); row != null; row = input.ReadLine())
            {
                lineNumber++;
                var valuesByField = new Dictionary<string, List<string>>(knownFields.Count);

                foreach (Match m in fieldPattern.Matches(row))
                {
                    var fieldName = m.Groups[1].Value;
                    if (knownFields.Contains(fieldName))
                    {
                        var values = m.Groups[2].Value.Split(valueSeparators, StringSplitOptions.RemoveEmptyEntries);
                        valuesByField[fieldName] = values.ToList();
                    }
                    else
                    {
                        Console.WriteLine("Invalid format in line{0}", lineNumber);
                    }
                }

                var entry = new Pair<string, Dictionary<string, List<string>>>();
                entry.first  = labelPattern.Match(row).Value;
                entry.second = valuesByField;
                result.Add(entry);
            }
            input.Close();
            return result;
        }
Exemple #7
0
        /// <summary>
        /// Load the word table once and publish it as <c>word2index</c>.
        /// </summary>
        /// <remarks>
        /// The first tab-separated column of every line is a word. Each distinct word receives
        /// the next free index (0, 1, 2, ...) in order of first appearance. The load runs under
        /// <c>wordTableLocker</c> and is skipped when <c>word2index</c> is already set.
        /// </remarks>
        private static void LoadWordTable()
        {
            lock (wordTableLocker)
            {
                if (word2index != null)
                {
                    return;
                }

                FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file));
                var        dic    = new Dictionary<string, int>();

                try
                {
                    String line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        var word = line.Split('\t')[0];
                        // Keep the index of the first occurrence. Re-assigning dic.Count to a
                        // repeated word would give it the same index as the next new word.
                        if (!dic.ContainsKey(word))
                        {
                            dic.Add(word, dic.Count);
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
                word2index = dic;
            }
        }
Exemple #8
0
        /// <summary>
        /// Load the mention-to-cluster-id table once and publish it as <c>mentionIdDic</c>.
        /// </summary>
        /// <remarks>
        /// Each line is "mention TAB id". Mention words are separated by "_" in the file; every
        /// run of underscores is replaced by a single space before the mention is stored.
        /// Lines without an id column or with a non-integer id are skipped.
        /// <c>mentionClusterSize</c> is set to the number of distinct ids.
        /// </remarks>
        private static void LoadMentionClusterID()
        {
            lock (mentionIDLocker)
            {
                if (mentionIdDic != null)
                {
                    return;
                }

                var        dic    = new Dictionary<string, int>();
                var        ids    = new HashSet<int>();
                var        regex  = new System.Text.RegularExpressions.Regex(@"_+");
                FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.mention_id_file));

                try
                {
                    string line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        var array = line.Split('\t');
                        int id;
                        // Explicit validation instead of catching every exception.
                        if (array.Length < 2 || !int.TryParse(array[1], out id))
                        {
                            continue;
                        }
                        ids.Add(id);
                        dic[regex.Replace(array[0], " ")] = id;
                    }
                }
                finally
                {
                    reader.Close();
                }
                mentionClusterSize = ids.Count;
                mentionIdDic       = dic;
            }
        }
Exemple #9
0
        /// <summary>
        /// Load the stem map once and publish it as <c>stemWordDic</c>.
        /// </summary>
        /// <remarks>
        /// Each line holds two tab-separated columns; the first is stored as the key and the
        /// second as the value. Lines with fewer than two columns are skipped. The load runs
        /// under <c>stemmerLocker</c> and is skipped when the map is already set.
        /// </remarks>
        private static void LoadStemMap()
        {
            lock (stemmerLocker)
            {
                if (stemWordDic != null)
                {
                    return;
                }

                var        dic    = new Dictionary<string, string>();
                FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.stem_map));

                try
                {
                    string line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        var array = line.Split('\t');
                        // Explicit length check instead of catching IndexOutOfRangeException.
                        if (array.Length < 2)
                        {
                            continue;
                        }
                        dic[array[0]] = array[1];
                    }
                }
                finally
                {
                    reader.Close();
                }
                stemWordDic = dic;
            }
        }
Exemple #10
0
        /// <summary>
        /// Load the word-to-cluster-id table once and publish it as <c>wordIdDic</c>.
        /// </summary>
        /// <remarks>
        /// Each line is "word TAB id". Lines without an id column or with a non-integer id are
        /// skipped. <c>wordClusterSize</c> is set to the number of distinct ids.
        /// </remarks>
        private static void LoadWordClusterID()
        {
            lock (wordIDLocker)
            {
                if (wordIdDic != null)
                {
                    return;
                }

                var        dic    = new Dictionary<string, int>();
                var        ids    = new HashSet<int>();
                FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.word_id_file));

                try
                {
                    string line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        var array = line.Split('\t');
                        int id;
                        // Explicit validation instead of catching every exception.
                        if (array.Length < 2 || !int.TryParse(array[1], out id))
                        {
                            continue;
                        }
                        ids.Add(id);
                        dic[array[0]] = id;
                    }
                }
                finally
                {
                    reader.Close();
                }
                wordClusterSize = ids.Count;
                wordIdDic       = dic;
            }
        }
Exemple #11
0
        /// <summary>
        /// Read the dictionary file once and publish <c>dics</c> and <c>dicTypeMap</c>.
        /// </summary>
        /// <remarks>
        /// Each line is "entry TAB type TAB type ...". <c>dics</c> maps the entry to its list of
        /// types; <c>dicTypeMap</c> gives every distinct type a consecutive index starting at 0.
        /// Both tables are built locally and assigned only after the whole file has been read.
        /// </remarks>
        private static void LoadDictionary()
        {
            lock (dicLocker)
            {
                if (dicTypeMap != null)
                {
                    return;
                }

                FileReader reader  = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dic_file));
                var        entries = new Dictionary<string, List<string>>();
                var        types   = new HashSet<String>();

                try
                {
                    String line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        List<String> list = line.Split('\t').ToList();
                        List<String> strs = list.GetRange(1, list.Count - 1);
                        entries[list[0]] = strs;
                        foreach (var type in strs)
                        {
                            types.Add(type);
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }

                var dic = new Dictionary<String, int>();
                foreach (var type in types)
                {
                    dic.Add(type, dic.Count);
                }
                // Publish only complete tables.
                dics       = entries;
                dicTypeMap = dic;
            }
        }
        /// <summary>
        /// Detect the column separator and the vector dimension from the first line of
        /// <c>vectorFile</c>.
        /// </summary>
        /// <remarks>
        /// TAB and then space are tried as separators. The first one that splits the trimmed line
        /// into more than one column with a numeric second column is stored in <c>seperator</c>.
        /// </remarks>
        /// <returns>The number of columns on the first line minus one (the word column).</returns>
        /// <exception cref="FormatException">
        /// The file is empty or neither separator yields a numeric second column.
        /// </exception>
        private int GetVectorDimension()
        {
            FileReader reader = new LargeFileReader(vectorFile);
            string     line;

            try
            {
                line = reader.ReadLine();
            }
            finally
            {
                // Only the first line is needed; release the file right away.
                reader.Close();
            }
            if (line == null)
            {
                throw new FormatException("Cannot parse word vector file: the file is empty!");
            }
            line = line.Trim();

            // Detect into a local so a separator left over from an earlier call
            // cannot hide a failed detection.
            char   detected   = (char)0;
            char[] seperators = new char[] { '\t', ' ' };
            double d;

            foreach (var c in seperators)
            {
                var array = line.Split(c);
                if (array.Length > 1 &&
                    double.TryParse(array[1],
                                    System.Globalization.NumberStyles.Float | System.Globalization.NumberStyles.AllowThousands,
                                    System.Globalization.CultureInfo.InvariantCulture,
                                    out d))
                {
                    detected = c;
                    break;
                }
            }
            if (detected == (char)0)
            {
                throw new FormatException("Cannot parse word vector file with default seperators:TAB and Space!\r" +
                                          "Please check your file format!");
            }
            seperator = detected;
            return line.Split(detected).Length - 1;
        }
Exemple #13
0
        /// <summary>
        /// Load the POS tag table once and publish it as <c>posTag2index</c>.
        /// </summary>
        /// <remarks>
        /// Every line is one tag. Each distinct tag receives the next free index (0, 1, 2, ...)
        /// in order of first appearance. The load runs under <c>posTagLocker</c> and is skipped
        /// when the table is already set.
        /// </remarks>
        private static void LoadPosTagTable()
        {
            lock (posTagLocker)
            {
                if (posTag2index != null)
                {
                    return;
                }

                var        dic    = new Dictionary<string, int>();
                FileReader reader = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.posTag_table_file));

                try
                {
                    String line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        // Keep the index of the first occurrence. Re-assigning dic.Count to a
                        // repeated tag would give it the same index as the next new tag.
                        if (!dic.ContainsKey(line))
                        {
                            dic.Add(line, dic.Count);
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
                posTag2index = dic;
            }
        }
Exemple #14
0
        /// <summary>
        /// Load the DBpedia redirect table once and publish three lookup dictionaries.
        /// </summary>
        /// <remarks>
        /// Each line is "source TAB destination". Lines are lower-cased; parenthesised words
        /// matching <c>\(\w+\)</c> are removed and runs of underscores become a single space.
        /// <list type="bullet">
        /// <item><c>redirects</c>: cleaned source to cleaned destination.</item>
        /// <item><c>redirectsWithoutSpace</c>: both sides additionally passed through <c>deleteSpace</c>.</item>
        /// <item><c>redirectsWithoutSpace2WithSpace</c>: source passed through <c>deleteSpace</c>, destination as in <c>redirects</c>.</item>
        /// </list>
        /// Lines with fewer than two columns are skipped.
        /// </remarks>
        public static void LoadDBpediaRedirect()
        {
            lock (dbpediaRedirectLocker)
            {
                if (redirects != null)
                {
                    return;
                }

                var dic         = new Dictionary<string, string>();
                var dic2        = new Dictionary<string, string>();
                var dic3        = new Dictionary<string, string>();
                var regex       = new System.Text.RegularExpressions.Regex(@"_+");
                var deleteBrace = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
                var reader      = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dbpedia_redirect_file));

                try
                {
                    string line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        var array = line.ToLower().Split('\t');
                        if (array.Length < 2)
                        {
                            // A single malformed line must not abort the whole load.
                            continue;
                        }
                        var source = regex.Replace(deleteBrace.Replace(array[0], ""), " ").Trim();
                        var des    = regex.Replace(deleteBrace.Replace(array[1], ""), " ").Trim();
                        dic[source] = des;

                        var source2 = deleteSpace.Replace(source, "");
                        var des2    = deleteSpace.Replace(des, "");
                        dic2[source2] = des2;
                        dic3[source2] = des;
                    }
                }
                finally
                {
                    reader.Close();
                }
                redirects                       = dic;
                redirectsWithoutSpace           = dic2;
                redirectsWithoutSpace2WithSpace = dic3;
            }
        }
Exemple #15
0
        /// <summary>
        /// Count the items of a data file per type.
        /// </summary>
        /// <remarks>
        /// The type is the second tab-separated column of each line. Lines without a second
        /// column are skipped and not counted.
        /// </remarks>
        /// <param name="sourceFile">Path of the file to analyse.</param>
        /// <returns>
        /// One "\t[type]\t[count]\r" entry per type followed by "\ttotal\t[number of counted lines]".
        /// </returns>
        public static string StatisticItemNumberByType(String sourceFile)
        {
            FileReader reader    = new LargeFileReader(sourceFile);
            var        numByType = new Dictionary<string, int>();
            int        total     = 0;

            try
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    var array = line.Split('\t');
                    if (array.Length < 2)
                    {
                        continue;
                    }
                    total++;
                    int seen;
                    numByType.TryGetValue(array[1], out seen);   // seen stays 0 for a new type
                    numByType[array[1]] = seen + 1;
                }
            }
            finally
            {
                reader.Close();
            }

            StringBuilder buffer = new StringBuilder();
            foreach (var entry in numByType)
            {
                buffer.Append("\t" + entry.Key + "\t" + entry.Value + "\r");
            }
            buffer.Append("\ttotal\t" + total);
            return buffer.ToString();
        }
Exemple #16
0
        /// <summary>
        /// Compute the co-occurrence rate between test data and train data.
        /// </summary>
        /// <remarks>
        /// A test line "hits" when its first tab-separated column also appears as the first
        /// column of some train line. The type is the second column of the test line. Test
        /// lines without a second column are skipped and not counted.
        /// </remarks>
        /// <param name="trainFilePath">Path of the train data file.</param>
        /// <param name="testFilePath">Path of the test data file.</param>
        /// <returns>
        /// One "\t[type]\t[hits]\t[count]\t[hits/count]\r" entry per type followed by
        /// "\ttotal coverage is: [hits/total]".
        /// </returns>
        public static string StatisticCooccurrence(String trainFilePath, String testFilePath)
        {
            var    trainMentions = new HashSet<string>();
            var    hitNumByType  = new Dictionary<string, int>();
            var    numByType     = new Dictionary<string, int>();
            int    total         = 0;
            int    coNum         = 0;
            string line;

            // Store the mentions of the train data in a set.
            FileReader reader = new LargeFileReader(trainFilePath);
            try
            {
                while ((line = reader.ReadLine()) != null)
                {
                    trainMentions.Add(line.Split('\t')[0]);
                }
            }
            finally
            {
                reader.Close();
            }

            // Scan the test data.
            reader = new LargeFileReader(testFilePath);
            try
            {
                while ((line = reader.ReadLine()) != null)
                {
                    var array = line.Split('\t');
                    if (array.Length < 2)
                    {
                        continue;
                    }
                    total++;
                    int seen;
                    if (trainMentions.Contains(array[0]))
                    {
                        hitNumByType.TryGetValue(array[1], out seen);
                        hitNumByType[array[1]] = seen + 1;
                        coNum++;
                    }
                    numByType.TryGetValue(array[1], out seen);
                    numByType[array[1]] = seen + 1;
                }
            }
            finally
            {
                reader.Close();
            }

            StringBuilder buffer = new StringBuilder();
            foreach (var entry in numByType)
            {
                int hits;
                hitNumByType.TryGetValue(entry.Key, out hits);   // 0 when the type never hit
                buffer.Append("\t" + entry.Key + "\t" + hits + "\t" + entry.Value + "\t" + 1.0 * hits / entry.Value + "\r");
            }
            buffer.Append("\ttotal coverage is: " + (1.0 * coNum / total));
            return buffer.ToString();
        }
Exemple #17
0
        /// <summary>
        /// Compute how much of a data file is covered by a dictionary.
        /// </summary>
        /// <remarks>
        /// The first tab-separated column of every dictionary line is an entry. A source line is
        /// covered when the lower-cased first column is a dictionary entry; its type is the second
        /// column. Source lines without a second column are skipped and not counted.
        /// The overall coverage rate is also written to the console.
        /// </remarks>
        /// <param name="dicFile">Path of the dictionary file.</param>
        /// <param name="sourceFile">Path of the data file to check.</param>
        /// <returns>One "\t[type]\t[hits]\t[count]\t[hits/count]\r" entry per type.</returns>
        public static string StatisticDicCoverage(String dicFile, String sourceFile)
        {
            var    entries      = new HashSet<string>();
            var    hitNumByType = new Dictionary<string, int>();
            var    numByType    = new Dictionary<string, int>();
            int    total        = 0;
            int    coNum        = 0;
            String line;

            FileReader reader = new LargeFileReader(dicFile);
            try
            {
                while ((line = reader.ReadLine()) != null)
                {
                    entries.Add(line.Split('\t')[0]);
                }
            }
            finally
            {
                reader.Close();
            }

            reader = new LargeFileReader(sourceFile);
            try
            {
                while ((line = reader.ReadLine()) != null)
                {
                    var array = line.Split('\t');
                    if (array.Length < 2)
                    {
                        continue;
                    }
                    total++;
                    int seen;
                    if (entries.Contains(array[0].ToLower()))
                    {
                        coNum++;
                        hitNumByType.TryGetValue(array[1], out seen);
                        hitNumByType[array[1]] = seen + 1;
                    }
                    numByType.TryGetValue(array[1], out seen);
                    numByType[array[1]] = seen + 1;
                }
            }
            finally
            {
                reader.Close();
            }

            Console.WriteLine("dic coverage rate is: " + 1.0 * coNum / total);
            StringBuilder buffer = new StringBuilder();
            foreach (var entry in numByType)
            {
                int hits;
                hitNumByType.TryGetValue(entry.Key, out hits);   // 0 when the type never hit
                buffer.Append("\t" + entry.Key + "\t" + hits + "\t" + entry.Value + "\t" + 1.0 * hits / entry.Value + "\r");
            }
            return buffer.ToString();
        }
Exemple #18
0
        /// <summary>
        /// Reset <c>prepositions</c> and add every line returned by a <c>LargeFileReader</c>.
        /// </summary>
        /// <remarks>
        /// NOTE(review): the reader is created without a file path and is neither opened nor
        /// closed here; confirm which file this is meant to read.
        /// </remarks>
        private static void LoadPreposition()
        {
            prepositions = new HashSet<String>();
            FileReader source = new LargeFileReader();

            for (String entry = source.ReadLine(); entry != null; entry = source.ReadLine())
            {
                prepositions.Add(entry);
            }
        }
Exemple #19
0
        /// <summary>
        /// Load Bayes Statistic Model.
        /// </summary>
        /// <remarks>
        /// The file is read as a sequence of sections. A line that starts with a word character
        /// is a label and the line right after it is a field name. Every other line is trimmed
        /// and must be "value TAB count"; it is stored under the most recent label and field.
        /// Lines that do not have exactly two columns or whose count is not an integer are
        /// reported on the console and skipped.
        /// </remarks>
        /// <param name="modelFile">Path of the model file.</param>
        /// <returns>label --> field --> value --> count</returns>
        public static Dictionary <string, Dictionary <string, Dictionary <string, int> > > LoadModel(string modelFile)
        {
            var        model  = new Dictionary<string, Dictionary<string, Dictionary<string, int>>>();
            FileReader reader = new LargeFileReader(modelFile);
            var        regex  = new Regex(@"^\w");
            var        count  = 0;   // 1-based number of the line currently processed

            Dictionary<string, Dictionary<string, int>> dicByField;
            // Values that appear before the first label land in this unattached dictionary.
            var dicByValue = new Dictionary<string, int>();

            try
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    count++;
                    if (regex.IsMatch(line))
                    {
                        // New label: reuse its field table if the label was seen before.
                        if (!model.TryGetValue(line, out dicByField))
                        {
                            dicByField  = new Dictionary<string, Dictionary<string, int>>();
                            model[line] = dicByField;
                        }
                        var field = reader.ReadLine();
                        if (field == null)
                        {
                            // File ends right after a label: nothing more to read.
                            break;
                        }
                        count++;   // the field line counts too, keeping reported numbers accurate
                        if (!dicByField.TryGetValue(field, out dicByValue))
                        {
                            dicByValue        = new Dictionary<string, int>();
                            dicByField[field] = dicByValue;
                        }
                    }
                    else
                    {
                        var array = line.Trim().Split('\t');
                        int value;
                        if (array.Length != 2 || !int.TryParse(array[1], out value))
                        {
                            Console.WriteLine("Wrong Format in line" + count);
                            continue;
                        }
                        dicByValue[array[0]] = value;
                    }
                }
            }
            finally
            {
                reader.Close();
            }
            return model;
        }
Exemple #20
0
        /// <summary>
        /// Replace <c>w</c> with the weights stored in a file.
        /// </summary>
        /// <remarks>
        /// Each line is split on TAB; the first column is the key and the second column is
        /// parsed as the weight.
        /// </remarks>
        /// <param name="weightFilePath">Path of the weight file.</param>
        private void LoadWeight(string weightFilePath)
        {
            w = new Dictionary<string, double>();
            var input = new LargeFileReader(weightFilePath);

            for (string row = input.ReadLine(); row != null; row = input.ReadLine())
            {
                string[] columns = row.Split('\t');
                string   key     = columns[0];
                double   weight  = double.Parse(columns[1]);
                w[key] = weight;
            }
            input.Close();
        }
Exemple #21
0
        /// <summary>
        /// Extract the word table and the word shape table from the train data.
        /// </summary>
        /// <remarks>
        /// Train file format: Mention TAB Type TAB Context.
        /// The context (third column) is tokenized. For every non-empty token, its shape
        /// (<c>Feature.GetWordShape</c>) and its generalized form (<c>Generalizer.Generalize</c>)
        /// are each written once, in order of first appearance, to the word shape table file
        /// and the word table file. A line that fails is reported on the console and skipped.
        /// NOTE(review): the original comment says every word is lower-cased and stemmed;
        /// that presumably happens inside <c>Generalizer.Generalize</c> - confirm.
        /// </remarks>
        public void ExtractWordTable()
        {
            FileReader reader          = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.train_data_file));
            FileWriter writer          = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_table_file), FileMode.Create);
            FileWriter wordShapeWriter = new LargeFileWriter((string)GlobalParameter.Get(DefaultParameter.Field.word_shape_table_file), FileMode.Create);

            var wordTable      = new HashSet<string>();
            var wordShapeTable = new HashSet<string>();

            try
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    try
                    {
                        var array     = line.Split('\t');
                        var tokenizer = TokenizerPool.GetTokenizer();
                        try
                        {
                            foreach (var w in tokenizer.Tokenize(array[2]))
                            {
                                if (string.IsNullOrEmpty(w))
                                {
                                    continue;
                                }
                                // HashSet.Add returns true only for a value not seen before.
                                var shape = Feature.GetWordShape(w);
                                if (wordShapeTable.Add(shape))
                                {
                                    wordShapeWriter.WriteLine(shape);
                                }
                                var word = Generalizer.Generalize(w);
                                if (wordTable.Add(word))
                                {
                                    writer.WriteLine(word);
                                }
                            }
                        }
                        finally
                        {
                            // Hand the tokenizer back even when tokenizing this line failed.
                            TokenizerPool.ReturnTokenizer(tokenizer);
                        }
                    }
                    catch (Exception e)
                    {
                        Console.WriteLine("=================error!===============");
                        Console.WriteLine("\t" + e.Message);
                        Console.WriteLine("\t" + e.StackTrace);
                        Console.WriteLine("=================error!===============");
                    }
                }
            }
            finally
            {
                reader.Close();
                writer.Close();
                // The shape table writer was never closed before, risking lost output.
                wordShapeWriter.Close();
            }
        }
Exemple #22
0
        /// <summary>
        /// Read the per-type totals stored in <c>sourceFileInfoFile</c>.
        /// </summary>
        /// <remarks>
        /// Each line is split on TAB; the first column is the key and the second column is
        /// parsed as an integer.
        /// </remarks>
        /// <returns>A dictionary from the first column to the parsed second column.</returns>
        private Dictionary <string, int> LoadTotalNumByType()
        {
            var input  = new LargeFileReader(this.sourceFileInfoFile);
            var totals = new Dictionary<string, int>();

            for (string row = input.ReadLine(); row != null; row = input.ReadLine())
            {
                string[] columns = row.Split('\t');
                string   type    = columns[0];
                int      number  = int.Parse(columns[1]);
                totals[type] = number;
            }
            input.Close();
            return totals;
        }
Exemple #23
0
        /// <summary>
        /// Compare the result of old model and the newest model.
        /// </summary>
        /// <remarks>
        /// Each result line is split on TAB. An item (first column) is "right" when the third
        /// column starts with the second column, otherwise "wrong". The report is a 2x2 table
        /// counting the items by their outcome in the old and in the new result.
        /// </remarks>
        /// <param name="resultFile1">File path storing the old result.</param>
        /// <param name="resultFile2">File path storing the new result.</param>
        /// <returns>The comparison table as text, rows separated by "\r".</returns>
        public static string CompareResult(string resultFile1, string resultFile2)
        {
            var positiveItemsInResultOne = new HashSet<string>();
            var positiveItemsInResultTwo = new HashSet<string>();
            var negtiveItemsInResultOne  = new HashSet<string>();
            var negtiveItemsInResultTwo  = new HashSet<string>();

            CollectResultItems(resultFile1, positiveItemsInResultOne, negtiveItemsInResultOne);
            CollectResultItems(resultFile2, positiveItemsInResultTwo, negtiveItemsInResultTwo);

            var pp = GetIntersection(positiveItemsInResultOne, positiveItemsInResultTwo).Count;
            var pn = GetIntersection(positiveItemsInResultOne, negtiveItemsInResultTwo).Count;
            var np = GetIntersection(negtiveItemsInResultOne, positiveItemsInResultTwo).Count;
            var nn = GetIntersection(negtiveItemsInResultOne, negtiveItemsInResultTwo).Count;

            var report = new StringBuilder();
            report.Append("old|new | right | wrong\r");
            report.AppendFormat(" right  | {0} | {1}\r", pp, pn);
            report.AppendFormat(" wrong  | {0} | {1}\r", np, nn);
            return report.ToString();
        }

        /// <summary>
        /// Read one result file and sort its items into the right and the wrong set.
        /// </summary>
        private static void CollectResultItems(string resultFile, HashSet<string> positiveItems, HashSet<string> negativeItems)
        {
            FileReader reader = new LargeFileReader(resultFile);
            try
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    var array = line.Split('\t');
                    if (array[2].StartsWith(array[1]))
                    {
                        positiveItems.Add(array[0]);
                    }
                    else
                    {
                        negativeItems.Add(array[0]);
                    }
                }
            }
            finally
            {
                reader.Close();
            }
        }
Exemple #24
0
        /// <summary>
        /// Count the lines of the model file that are equal to <c>BayesModel.END</c>.
        /// </summary>
        /// <param name="modelFilePath">Path of the model file to scan.</param>
        /// <returns>The number of lines equal to <c>BayesModel.END</c>.</returns>
        public static int GetDimension(string modelFilePath)
        {
            FileReader modelReader = new LargeFileReader(modelFilePath);
            int        endMarkers  = 0;

            for (string current = modelReader.ReadLine(); current != null; current = modelReader.ReadLine())
            {
                if (!current.Equals(BayesModel.END))
                {
                    continue;
                }
                endMarkers++;
            }
            modelReader.Close();
            return endMarkers;
        }
Exemple #25
0
        /// <summary>
        /// Copy the first tab-separated field of every line of the hard-coded
        /// name-list frequency file into the hard-coded name-list output file.
        /// </summary>
        public static void ExtractUIUC()
        {
            string source = @"E:\Users\v-mipeng\Data\Dictionary\name-list.freq.txt";
            string des    = @"E:\Users\v-mipeng\Codes\C#\NLP\Fine-ner\data\name-list.txt";

            FileReader reader = new LargeFileReader(source);
            try
            {
                // NOTE(review): OpenOrCreate does not truncate an existing file; if the
                // destination may already exist with more content, FileMode.Create is
                // probably what is wanted — confirm before changing.
                var writer = new LargeFileWriter(des, FileMode.OpenOrCreate);
                try
                {
                    string line;

                    while ((line = reader.ReadLine()) != null)
                    {
                        writer.WriteLine(line.Split('\t')[0]);
                    }
                }
                finally
                {
                    // The original never closed the writer, so buffered output could be lost.
                    writer.Close();
                }
            }
            finally
            {
                reader.Close();
            }
        }
Exemple #26
0
        /// <summary>
        /// Combine files given by sourceFiles into one file given by desFile.
        /// Lines are copied in the order the source files are enumerated.
        /// </summary>
        /// <param name="sourceFiles">
        /// Source file paths to be combined
        /// </param>
        /// <param name="desFile">
        /// The file path to store the combined file
        /// </param>
        public static void CombineFiles(IEnumerable<string> sourceFiles, string desFile)
        {
            var reader = new LargeFileReader();
            var writer = new LargeFileWriter(desFile, FileMode.Create);

            try
            {
                try
                {
                    string line;

                    foreach (var file in sourceFiles)
                    {
                        // The single reader instance is re-pointed at each source in turn.
                        reader.Open(file);
                        while ((line = reader.ReadLine()) != null)
                        {
                            writer.WriteLine(line);
                        }
                    }
                }
                finally
                {
                    reader.Close();
                }
            }
            finally
            {
                // Always close the destination so already-copied lines are not left
                // unflushed when a source file fails to open or read.
                writer.Close();
            }
        }
            /// <summary>
            /// Scan the source file line by line, POS-tag the fourth tab-separated field
            /// of each line, and count for every generalized, lower-cased token whose tag
            /// starts with "N", "V" or "J" the number of lines it occurs in. The number of
            /// processed lines and the counts are stored in KeyWordSelector.tuples under
            /// this thread's id.
            /// </summary>
            public void GetKeyWordInfo()
            {
                Console.WriteLine("Thread {0} start.", threadID);
                var    reader            = new LargeFileReader(source);
                var    documentFrequency = new Dictionary<string, int>();
                var    processed         = 0;
                var    tagger            = PosTaggerPool.GetPosTagger();
                var    distinctTokens    = new HashSet<string>();
                string row;

                // A line is read first and the cap is checked afterwards, so the loop
                // stops once more than 10000 lines have been counted.
                while ((row = reader.ReadLine()) != null && processed <= 10000)
                {
                    processed++;
                    if (processed % 1000 == 0)
                    {
                        Console.WriteLine("Thread {0} has processed: {1}", threadID, processed);
                    }

                    var fields = row.Split('\t');
                    var tagged = tagger.TagString(fields[3]);

                    // Collect each qualifying token once per line.
                    distinctTokens.Clear();
                    foreach (var pair in tagged)
                    {
                        var tag = pair.second;
                        if (!(tag.StartsWith("N") || tag.StartsWith("V") || tag.StartsWith("J")))
                        {
                            continue;
                        }
                        distinctTokens.Add(Generalizer.Generalize(pair.first).ToLower());
                    }

                    foreach (var token in distinctTokens)
                    {
                        int seen;
                        documentFrequency[token] = documentFrequency.TryGetValue(token, out seen) ? seen + 1 : 1;
                    }
                }
                reader.Close();
                PosTaggerPool.ReturnPosTagger(tagger);
                KeyWordSelector.tuples[threadID] = new Tuple(processed, documentFrequency);
            }
        /// <summary>
        /// Read word vectors from the vector file. When the vocabulary size is not yet
        /// known (size == 0) every well-formed line is loaded and the size is derived
        /// from the number of vectors read; otherwise loading is delegated to
        /// LoadVectors(size, dimension).
        /// </summary>
        private void LoadVectors()
        {
            if (this.dimension == 0)
            {
                this.dimension = GetVectorDimension();
            }
            if (this.size != 0)
            {
                LoadVectors(this.size, this.dimension);
                return;
            }

            FileReader reader = new LargeFileReader(vectorFile);
            this.words = new List<string>();
            var loaded = new List<double[]>();

            try
            {
                string line;

                while ((line = reader.ReadLine()) != null)
                {
                    var array = line.Trim().Split(' ');
                    // Skip lines that are not "word v1 ... vN" with exactly N = dimension values.
                    if (array.Length != dimension + 1)
                    {
                        continue;
                    }
                    words.Add(array[0]);
                    var vector = new double[dimension];
                    for (int i = 1; i < array.Length; i++)
                    {
                        // Parse with the invariant culture so "0.12" is read the same way
                        // regardless of the machine's regional settings.
                        vector[i - 1] = double.Parse(array[i], System.Globalization.CultureInfo.InvariantCulture);
                    }
                    loaded.Add(vector);
                }
            }
            finally
            {
                reader.Close();
            }
            this.size    = loaded.Count;
            this.vectors = loaded.ToArray();
        }
Exemple #29
0
        /// <summary>
        /// Refine the disambiguations file downloaded from DBpedia: group the
        /// destination names by source name (with any parenthesised qualifier removed
        /// from the source) and write one line per source in the form
        /// "source\tdes1\tdes2...".
        /// </summary>
        /// <param name="sourceFile">Path of the disambiguations file to read.</param>
        /// <param name="desFile">Path of the refined file to create.</param>
        public static void RefineAmbiguousItem(string sourceFile, string desFile)
        {
            var reader = new LargeFileReader(sourceFile);
            var writer = new LargeFileWriter(desFile, System.IO.FileMode.Create);

            System.Text.RegularExpressions.Regex sourceRegex      = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>");
            System.Text.RegularExpressions.Regex deleteBraceRegex = new System.Text.RegularExpressions.Regex(@"_?\([^\)]+\)");
            System.Text.RegularExpressions.Regex desRegex         = new System.Text.RegularExpressions.Regex(@"/([^/>]+)>\s\.$");

            var dic = new Dictionary<string, List<string>>(300000);

            try
            {
                try
                {
                    string line;

                    // The first line is discarded.
                    // NOTE(review): presumably a header/comment line of the dump — confirm.
                    reader.ReadLine();

                    while ((line = reader.ReadLine()) != null)
                    {
                        var sourceMatch = sourceRegex.Match(line);
                        var desMatch    = desRegex.Match(line);
                        // Lines that do not have the expected shape would otherwise be
                        // recorded under an empty key with empty destinations.
                        if (!sourceMatch.Success || !desMatch.Success)
                        {
                            continue;
                        }

                        var source = deleteBraceRegex.Replace(sourceMatch.Groups[1].Value, "");
                        List<string> list;
                        if (!dic.TryGetValue(source, out list))
                        {
                            list        = new List<string>();
                            dic[source] = list;
                        }
                        list.Add(desMatch.Groups[1].Value);
                    }
                }
                finally
                {
                    reader.Close();
                }

                foreach (var item in dic)
                {
                    writer.Write(item.Key);
                    foreach (var des in item.Value)
                    {
                        writer.Write("\t" + des);
                    }
                    writer.WriteLine("");
                }
            }
            finally
            {
                writer.Close();
            }
        }
Exemple #30
0
        /// <summary>
        /// Populate dbpediaEntity2Type from the DBpedia dictionary file if it has not
        /// been loaded yet. Each line is lower-cased and split on tabs; the first field,
        /// with "(word)" groups and underscores removed, is the entity key and the second
        /// field is a type. An entity seen once maps to its type string; an entity seen
        /// with more than one line maps to a HashSet of its types.
        /// </summary>
        public static void LoadDBpedia()
        {
            lock (dbpediaDicLocker)
            {
                // Already loaded: nothing to do.
                if (dbpediaEntity2Type != null)
                {
                    return;
                }

                var entityTypes = new Dictionary<string, object>();
                var reader      = new LargeFileReader((string)GlobalParameter.Get(DefaultParameter.Field.dbpedia_dic_file));
                var underscores = new System.Text.RegularExpressions.Regex(@"_+");
                var bracketed   = new System.Text.RegularExpressions.Regex(@"\(\w+\)");
                string row;

                while ((row = reader.ReadLine()) != null)
                {
                    var fields = row.ToLower().Split('\t');
                    // Strip the bracketed qualifier first, then drop every underscore run.
                    var key = underscores.Replace(bracketed.Replace(fields[0], ""), "").Trim();

                    object known;
                    if (!entityTypes.TryGetValue(key, out known))
                    {
                        entityTypes[key] = fields[1];
                        continue;
                    }

                    var single = known as string;
                    if (single != null)
                    {
                        // Second occurrence: promote the single type string to a set.
                        var promoted = new HashSet<string>();
                        promoted.Add(single);
                        promoted.Add(fields[1]);
                        entityTypes[key] = promoted;
                    }
                    else
                    {
                        ((HashSet<string>)known).Add(fields[1]);
                    }
                }
                reader.Close();
                dbpediaEntity2Type = entityTypes;
            }
        }