/// <summary>
/// Reads <paramref name="input"/> line by line, runs each line through
/// <c>RemoveTags</c>, <c>ReplaceTags</c> and <c>RemoveAllTags</c>, splits the result on
/// spaces and writes one token per line to two files: <paramref name="output"/>
/// receives "token LABEL" (LABEL is <c>LOCATION</c> or <c>OTHER</c>) and
/// <paramref name="output"/> + ".dev" receives the bare token.
/// </summary>
/// <param name="input">Value handed to the <c>ReadModel</c> that supplies the source lines.</param>
/// <param name="output">Value handed to the labelled <c>WriteModel</c>; ".dev" is appended for the unlabelled one.</param>
public void Parse(string input, string output)
        {
            var readModel = new ReadModel(input);
            var writeModel = new WriteModel(output);
            var writeDevModel = new WriteModel(output + ".dev");
            var tokens = new List<string>();

            // Pass 1: clean every line and collect its space-separated tokens.
            foreach (var line in readModel.GetNextLine())
            {
                var newLine = RemoveTags(line);
                newLine = ReplaceTags(newLine);
                newLine = RemoveAllTags(newLine);
                if (string.IsNullOrEmpty(newLine))
                {
                    continue;
                }

                tokens.AddRange(newLine.Split(' '));
            }

            // Pass 2: label tokens. The ##LOCATIONSTARTTAG## / ##ENDTAG## markers are
            // presumably inserted by ReplaceTags (not visible here - confirm).
            var location = false;
            var lastStr = string.Empty;

            foreach (var rawToken in tokens)
            {
                var str = rawToken.Trim();
                if (string.IsNullOrEmpty(str))
                {
                    lastStr = string.Empty;
                    continue;
                }

                // Sentence boundary: the previous emitted token ended with "." and is not a
                // salutation abbreviation. Emit the blank separator line and then keep
                // processing the current token. Skipping it here would silently drop the
                // first token of every following sentence (and miss a location start marker).
                if (!location
                    && lastStr.EndsWith(".", System.StringComparison.Ordinal)
                    && !IsSalutationAbbr(lastStr))
                {
                    lastStr = string.Empty;
                    writeModel.WriteLine("");
                    writeDevModel.WriteLine("");
                }

                if (location)
                {
                    if (str.Equals("##ENDTAG##"))
                    {
                        location = false;
                        lastStr = string.Empty;
                        continue;
                    }

                    writeModel.WriteLine(str + " " + "LOCATION");
                    writeDevModel.WriteLine(str);
                    lastStr = str;
                    continue;
                }

                if (str.Equals("##LOCATIONSTARTTAG##"))
                {
                    lastStr = string.Empty;
                    location = true;
                    continue;
                }

                // An end marker outside a location span is discarded.
                if (str.Equals("##ENDTAG##"))
                {
                    lastStr = string.Empty;
                    continue;
                }

                writeModel.WriteLine(str + " " + "OTHER");
                writeDevModel.WriteLine(str);
                lastStr = str;
            }

            writeModel.Flush();
            writeDevModel.Flush();
        }
        /// <summary>
        /// Turns each line of <paramref name="input"/> into one-token-per-line output.
        /// <paramref name="output"/> + ".key" receives "token LABEL" and
        /// <paramref name="output"/> + ".key.dev" receives the bare token; a blank line
        /// is written to both after every processed input line.
        /// </summary>
        /// <param name="input">Value handed to the <c>ReadModel</c> that supplies the lines.</param>
        /// <param name="output">Base value for the two <c>WriteModel</c> targets.</param>
        internal static void CreateInputForCRF(string input, string output)
        {
            var source = new ReadModel(input);
            var labelledOut = new WriteModel(output + ".key");
            var plainOut = new WriteModel(output + ".key.dev");

            foreach (var sentence in source.GetNextLine())
            {
                var tokens = sentence.Split(' ');

                // Lines with fewer than four space-separated pieces are ignored entirely.
                if (tokens.Length >= 4)
                {
                    foreach (var token in tokens)
                    {
                        if (token.Trim().Length == 0)
                        {
                            continue;
                        }

                        string text;
                        string label;

                        if (token.EndsWith("{LOCATION}"))
                        {
                            // Marker at the very end: strip it.
                            text = token.Replace("{LOCATION}", "");
                            label = "LOCATION";
                        }
                        else if (token.EndsWith("{LOCATION}."))
                        {
                            // Marker followed by a full stop: strip the marker, keep the dot.
                            text = token.Replace("{LOCATION}.", ".");
                            label = "LOCATION";
                        }
                        else
                        {
                            text = token;
                            label = "OTHER";
                        }

                        labelledOut.WriteLine(text + " " + label);
                        plainOut.WriteLine(text);
                    }

                    // Blank line separates consecutive input lines in both files.
                    labelledOut.WriteLine("");
                    plainOut.WriteLine("");
                }
            }

            labelledOut.Flush();
            plainOut.Flush();
        }
Beispiel #3
0
        /// <summary>
        /// Creates every word set (black list, pronouns, conjunctions, verbs, articles,
        /// prepositions, suffixes, adjectives) and fills each one from the
        /// <c>ReadModel</c> opened on its corresponding list member.
        /// </summary>
        private Config()
        {
            BlackList = new HashSet<string>();
            FillWordSet(new ReadModel(blackList), BlackList);

            PronounSet = new HashSet<string>();
            FillWordSet(new ReadModel(PronounList), PronounSet);

            ConjunctionSet = new HashSet<string>();
            FillWordSet(new ReadModel(ConjuctionList), ConjunctionSet);

            VerbSet = new HashSet<string>();
            FillWordSet(new ReadModel(VerbList), VerbSet);

            ArticleSet = new HashSet<string>();
            FillWordSet(new ReadModel(ArticleList), ArticleSet);

            PrepositionSet = new HashSet<string>();
            FillWordSet(new ReadModel(PrepositionList), PrepositionSet);

            SuffixSet = new HashSet<string>();
            FillWordSet(new ReadModel(SuffixList), SuffixSet);

            AdjectiveSet = new HashSet<string>();
            FillWordSet(new ReadModel(AdjectiveList), AdjectiveSet);
        }

        /// <summary>
        /// Adds every line yielded by <paramref name="source"/> to <paramref name="target"/>
        /// after lower-casing it with the invariant culture and trimming it; lines that
        /// are empty after trimming are skipped.
        /// </summary>
        /// <param name="source">Reader whose <c>GetNextLine()</c> supplies the raw lines.</param>
        /// <param name="target">Collection that receives the normalised words.</param>
        private static void FillWordSet(ReadModel source, ICollection<string> target)
        {
            foreach (var line in source.GetNextLine())
            {
                var word = line.ToLowerInvariant().Trim();
                if (!string.IsNullOrEmpty(word))
                {
                    target.Add(word);
                }
            }
        }