Пример #1
0
        /// <summary>
        /// Trains a model from the data file named in <paramref name="options"/> and writes the
        /// tag/feature dictionaries and the non-zero learned weights into options.BasePath.
        /// </summary>
        /// <remarks>
        /// The data file is scanned once to assign integer ids to tags (first token of each
        /// non-blank line) and, unless feature hashing is enabled, to features (remaining tokens).
        /// It is then read again through ReadInstances to feed the trainer. When more than one
        /// iteration is requested, the instances of that pass are serialized to a gzip-compressed
        /// ".cache" file under options.BasePath and replayed for each remaining iteration.
        /// </remarks>
        /// <param name="options">Data file, base path, iteration count and hashing settings.</param>
        static void Train(Options options)
        {
            Dictionary<string, int> tagMap = new Dictionary<string, int>();
            Dictionary<string, int> featureMap = new Dictionary<string, int>();

            //first pass: hand out ids in order of first appearance
            using (StreamReader reader = new StreamReader(options.DataFile))
            {
                string line;
                while ((line = reader.ReadLine()) != null)
                {
                    //blank (or whitespace-only) lines carry no tag or features
                    if (line.Trim().Length == 0)
                    {
                        continue;
                    }

                    var tokens = Regex.Split(line, @"\s+");
                    if (!tagMap.ContainsKey(tokens[0]))
                    {
                        //the next free id is simply the current size of the map
                        tagMap.Add(tokens[0], tagMap.Count);
                    }

                    if (!options.UseFeatureHashing)
                    {
                        for (int i = 1; i < tokens.Length; i++)
                        {
                            if (!featureMap.ContainsKey(tokens[i]))
                            {
                                featureMap.Add(tokens[i], featureMap.Count);
                            }
                        }
                    }
                }
            }

            //key variables
            int numTags = tagMap.Count;
            int numFeatures = options.UseFeatureHashing ? (2 << options.HashBits) : featureMap.Count;

            Trainer trainer = new Trainer(numTags, numFeatures);

            //use binary cache for multiple passes
            //NOTE(review): BinaryFormatter is flagged as unsafe/obsolete in current .NET; it is only
            //used here to round-trip a file written by this same run, but consider replacing it.
            string cacheFile = Path.Combine(options.BasePath, ".cache");
            IFormatter binaryFormatter = new BinaryFormatter();
            bool cacheInstances = options.NumIterations > 1;
            int cachedCount = 0;
            using (Stream writer = new GZipStream(new FileStream(cacheFile, FileMode.Create, FileAccess.Write), CompressionMode.Compress))
            {
                foreach (var instance in ReadInstances(tagMap, featureMap, options))
                {
                    trainer.LearnFromOneInstance(instance.WordsWithFeatures, instance.LabelledTags);

                    if (cacheInstances)
                    {
                        binaryFormatter.Serialize(writer, instance);
                        cachedCount++;
                    }
                }
            }

            //remaining passes replay the cache. The number of instances is tracked explicitly:
            //the position of the underlying file stream cannot be used as an end-of-data test
            //because the decompressor reads the file ahead in chunks, so the file position hits
            //the end while decoded instances are still pending.
            for (int i = 1; i < options.NumIterations; i++)
            {
                using (Stream compressedReader = new GZipStream(new FileStream(cacheFile, FileMode.Open, FileAccess.Read), CompressionMode.Decompress))
                {
                    for (int k = 0; k < cachedCount; k++)
                    {
                        Instance cachedInstance = (Instance)binaryFormatter.Deserialize(compressedReader);
                        trainer.LearnFromOneInstance(cachedInstance.WordsWithFeatures, cachedInstance.LabelledTags);
                    }
                }
            }

            //save model
            Model model = trainer.GetModel();
            string tagDictionaryPath = Path.Combine(options.BasePath, "TagDictionary.txt");
            using (StreamWriter writer = new StreamWriter(tagDictionaryPath))
            {
                foreach (var kvp in tagMap)
                {
                    writer.WriteLine("{0}\t{1}", kvp.Key, kvp.Value);
                }
            }

            //with hashing there is no name-to-id map to persist
            if (!options.UseFeatureHashing)
            {
                string featureDictionaryPath = Path.Combine(options.BasePath, "FeatureDictionary.txt");
                using (StreamWriter writer = new StreamWriter(featureDictionaryPath))
                {
                    foreach (var kvp in featureMap)
                    {
                        writer.WriteLine("{0}\t{1}", kvp.Key, kvp.Value);
                    }
                }
            }

            //weights are stored sparsely: only non-zero entries are written
            string transitionPath = Path.Combine(options.BasePath, "TagTransitionProbabilities.txt");
            using (StreamWriter writer = new StreamWriter(transitionPath))
            {
                for (int t1 = 0; t1 < numTags; t1++)
                {
                    for (int t2 = 0; t2 < numTags; t2++)
                    {
                        if (model.alphaTagPreviousTag[t1, t2] != 0)
                        {
                            writer.WriteLine("{0}\t{1}\t{2}", t1, t2, model.alphaTagPreviousTag[t1, t2]);
                        }
                    }
                }
            }

            string tagFeaturePath = Path.Combine(options.BasePath, "TagFeatureProbabilities.txt");
            using (StreamWriter writer = new StreamWriter(tagFeaturePath))
            {
                for (int t = 0; t < numTags; t++)
                {
                    for (int f = 0; f < numFeatures; f++)
                    {
                        if (model.alphaTagFeature[t, f] != 0)
                        {
                            writer.WriteLine("{0}\t{1}\t{2}", t, f, model.alphaTagFeature[t, f]);
                        }
                    }
                }
            }
        }
Пример #2
0
 /// <summary>
 /// Builds an Options instance from command-line arguments.
 /// </summary>
 /// <remarks>
 /// Recognised switches, matched case-insensitively and each followed by exactly one value:
 /// --usefeaturehashing (value is the number of hash bits; also turns hashing on),
 /// --basepath, --numiterations, --data and --output.
 /// Numeric values are parsed with int.Parse, whose exceptions propagate to the caller.
 /// </remarks>
 /// <param name="args">Raw command-line arguments.</param>
 /// <returns>The populated options.</returns>
 /// <exception cref="ArgumentException">
 /// An argument is not a recognised switch, or a switch is the last argument and has no value.
 /// </exception>
 static Options ParseArguments(string[] args)
 {
     Options options = new Options();
     for (int i = 0; i < args.Length; i++)
     {
         if ("--usefeaturehashing".Equals(args[i], StringComparison.OrdinalIgnoreCase))
         {
             options.UseFeatureHashing = true;
             options.HashBits = int.Parse(ReadOptionValue(args, ref i));
         }
         else if ("--basepath".Equals(args[i], StringComparison.OrdinalIgnoreCase))
         {
             options.BasePath = ReadOptionValue(args, ref i);
         }
         else if ("--numiterations".Equals(args[i], StringComparison.OrdinalIgnoreCase))
         {
             options.NumIterations = int.Parse(ReadOptionValue(args, ref i));
         }
         else if ("--data".Equals(args[i], StringComparison.OrdinalIgnoreCase))
         {
             options.DataFile = ReadOptionValue(args, ref i);
         }
         else if ("--output".Equals(args[i], StringComparison.OrdinalIgnoreCase))
         {
             options.Output = ReadOptionValue(args, ref i);
         }
         else
         {
             throw new ArgumentException(args[i]);
         }
     }
     return options;
 }

 // Returns the value following the switch at args[index] and advances index onto that value.
 // Throws ArgumentException instead of running off the end of the array when the value is missing.
 static string ReadOptionValue(string[] args, ref int index)
 {
     if (index + 1 >= args.Length)
     {
         throw new ArgumentException("Missing value for option " + args[index]);
     }
     return args[++index];
 }
Пример #3
0
        /// <summary>
        /// Lazily reads labelled sequences from options.DataFile, producing one Instance per
        /// run of consecutive non-empty lines.
        /// </summary>
        /// <remarks>
        /// Each non-empty line is split on whitespace: the first token is the tag, the remaining
        /// tokens are features. With feature hashing the feature ids are computed from the token
        /// bytes; otherwise tokens missing from <paramref name="featureMap"/> are dropped.
        /// Duplicate feature ids within a line are removed. Sequences longer than
        /// Viterbi.MAX_WORDS are skipped.
        /// </remarks>
        /// <param name="tagMap">Tag name to id map; must contain "O" if unseen tags can occur.</param>
        /// <param name="featureMap">Feature name to id map (unused when hashing is enabled).</param>
        /// <param name="options">Supplies DataFile, UseFeatureHashing and HashBits.</param>
        /// <returns>The instances in file order.</returns>
        static IEnumerable<Instance> ReadInstances(
            Dictionary<string,int> tagMap,
            Dictionary<string,int> featureMap,
            Options options)
        {
            using (StreamReader reader = new StreamReader(options.DataFile))
            {
                string line = null;
                List<int[]> features = new List<int[]>();
                List<int> labelledTags = new List<int>();

                //folds a hash value into the feature id range; loop-invariant, so computed once
                int hashMask = (2 << options.HashBits) - 1;

                while (!reader.EndOfStream)
                {
                    //read one example (terminated by an empty line or end of file)
                    while (!string.IsNullOrEmpty((line = reader.ReadLine())))
                    {
                        string[] tokens = Regex.Split(line, @"\s+");

                        //if using feature hashing, don't maintain a feature name to id map;
                        //instead use hashing to compute the id on the fly
                        if (options.UseFeatureHashing)
                        {
                            features.Add(tokens
                                .Skip(1)
                                .Select(x => MurMurHash3.Hash(Encoding.UTF8.GetBytes(x)) & hashMask)
                                .Distinct()
                                .ToArray());
                        }
                        else
                        {
                            features.Add(tokens
                                .Skip(1)
                                .Where(x => featureMap.ContainsKey(x))
                                .Select(x => featureMap[x])
                                .Distinct()
                                .ToArray());
                        }

                        //ugly hack : only affects reported PR stats not final output
                        //if tag is unseen assume it to be O
                        int tagId;
                        if (!tagMap.TryGetValue(tokens[0], out tagId))
                        {
                            tagId = tagMap["O"];
                        }
                        labelledTags.Add(tagId);
                    }

                    //skip the sequence if it is too long
                    if (features.Count <= Viterbi.MAX_WORDS)
                    {
                        yield return new Instance()
                        {
                            WordsWithFeatures = features.ToArray(),
                            LabelledTags = labelledTags.ToArray()
                        };
                    }

                    //always reset the buffers, including after a skipped sequence; otherwise an
                    //over-long sequence would keep growing and swallow every sequence after it
                    features.Clear();
                    labelledTags.Clear();
                }
            }
        }
Пример #4
0
        /// <summary>
        /// Labels every sequence of the data file with a previously saved model, writes the
        /// predicted tag names to options.Output (one per line, blank line after each sequence)
        /// and prints per-tag and per-sequence accuracy figures to the console.
        /// </summary>
        /// <param name="options">Base path of the model files, data file, output file and hashing settings.</param>
        static void Tag(Options options)
        {
            //tag name <-> id lookups, loaded from the saved tag dictionary
            Dictionary<string, int> tagIds = new Dictionary<string, int>();
            Dictionary<int, string> tagNames = new Dictionary<int, string>();

            string tagDictionaryPath = Path.Combine(options.BasePath, "TagDictionary.txt");
            using (StreamReader reader = new StreamReader(tagDictionaryPath))
            {
                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    string[] fields = Regex.Split(row, @"\s+");
                    string name = fields[0];
                    tagIds.Add(name, int.Parse(fields[1]));
                    tagNames.Add(int.Parse(fields[1]), name);
                }
            }

            //feature dictionary is only present (and needed) when features are not hashed
            Dictionary<string, int> featureIds = new Dictionary<string, int>();
            int numFeatures;
            if (options.UseFeatureHashing)
            {
                numFeatures = 2 << options.HashBits;
            }
            else
            {
                string featureDictionaryPath = Path.Combine(options.BasePath, "FeatureDictionary.txt");
                using (StreamReader reader = new StreamReader(featureDictionaryPath))
                {
                    for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                    {
                        string[] fields = Regex.Split(row, @"\s+");
                        featureIds.Add(fields[0], int.Parse(fields[1]));
                    }
                }
                numFeatures = featureIds.Count;
            }
            int numTags = tagIds.Count;

            Model model = new Model()
            {
                alphaTagFeature = new float[numTags, numFeatures],
                alphaTagPreviousTag = new float[numTags, numTags]
            };

            //fill in the weights; each row is "rowIndex columnIndex value", absent entries stay 0
            string transitionPath = Path.Combine(options.BasePath, "TagTransitionProbabilities.txt");
            using (StreamReader reader = new StreamReader(transitionPath))
            {
                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    string[] fields = Regex.Split(row, @"\s+");
                    int tag = int.Parse(fields[0]);
                    int previousTag = int.Parse(fields[1]);
                    float weight = float.Parse(fields[2]);
                    model.alphaTagPreviousTag[tag, previousTag] = weight;
                }
            }
            string tagFeaturePath = Path.Combine(options.BasePath, "TagFeatureProbabilities.txt");
            using (StreamReader reader = new StreamReader(tagFeaturePath))
            {
                for (string row = reader.ReadLine(); row != null; row = reader.ReadLine())
                {
                    string[] fields = Regex.Split(row, @"\s+");
                    int tag = int.Parse(fields[0]);
                    int feature = int.Parse(fields[1]);
                    float weight = float.Parse(fields[2]);
                    model.alphaTagFeature[tag, feature] = weight;
                }
            }

            //counters for the precision/recall report, keyed by tag id
            Dictionary<int, int> correctCounts = new Dictionary<int, int>();
            Dictionary<int, int> goldCounts = new Dictionary<int, int>();
            Dictionary<int, int> predictedCounts = new Dictionary<int, int>();
            for (int tag = 0; tag < numTags; tag++)
            {
                goldCounts[tag] = 0;
                correctCounts[tag] = 0;
                predictedCounts[tag] = 0;
            }

            int fullyCorrectSequences = 0;
            int totalSequences = 0;

            Tagger tagger = new Tagger(numTags, model.alphaTagFeature, model.alphaTagPreviousTag);
            //output buffer reused across sequences; only the first wordCount slots are read
            int[] tags = new int[Viterbi.MAX_WORDS];
            using (StreamWriter writer = new StreamWriter(options.Output))
            {
                foreach (Instance instance in ReadInstances(tagIds, featureIds, options))
                {
                    tagger.Label(instance.WordsWithFeatures, tags);

                    int wordCount = instance.WordsWithFeatures.Length;
                    string[] predictedNames = new string[wordCount];
                    for (int w = 0; w < wordCount; w++)
                    {
                        predictedNames[w] = tagNames[tags[w]];
                    }
                    writer.WriteLine(string.Join(Environment.NewLine, predictedNames));
                    writer.WriteLine();

                    bool sequenceCorrect = true;
                    for (int w = 0; w < wordCount; w++)
                    {
                        int expected = instance.LabelledTags[w];
                        int predicted = tags[w];

                        goldCounts[expected] += 1;
                        predictedCounts[predicted] += 1;
                        if (expected != predicted)
                        {
                            sequenceCorrect = false;
                            continue;
                        }
                        correctCounts[expected] += 1;
                    }

                    totalSequences++;
                    if (sequenceCorrect)
                    {
                        fullyCorrectSequences++;
                    }
                }
            }

            //per tag: name, predicted count, correct count, gold count, precision, recall
            for (int tag = 0; tag < numTags; tag++)
            {
                string[] columns = new string[]
                {
                    tagNames[tag],
                    predictedCounts[tag].ToString(),
                    correctCounts[tag].ToString(),
                    goldCounts[tag].ToString(),
                    (correctCounts[tag] * 1.0 / predictedCounts[tag]).ToString(),
                    (correctCounts[tag] * 1.0 / goldCounts[tag]).ToString()
                };
                Console.WriteLine(string.Join("\t", columns));
            }

            //overall: fully correct sequences, total sequences, ratio
            string[] summary = new string[]
            {
                fullyCorrectSequences.ToString(),
                totalSequences.ToString(),
                (fullyCorrectSequences * 1.0 / totalSequences).ToString()
            };
            Console.WriteLine(string.Join("\t", summary));
        }