예제 #1
0
        /// <summary>
        /// Reads <paramref name="file"/> as UTF-8 text and appends one <c>Entity</c> to
        /// <paramref name="nes"/> for every MEX-tagged span the pattern finds in it.
        /// </summary>
        /// <param name="file">Path handed to <c>DataStructReader.readWholeTextFile</c>.</param>
        /// <param name="nes">List that receives the entities, in order of appearance in the text.</param>
        /// <param name="additionalAnnotation">
        /// When true, the tagged text is passed through <c>stripAnnotation</c> before it is stored;
        /// otherwise it is stored exactly as matched.
        /// </param>
        private static void readNamedEntities(string file, ref List <Entity> nes, bool additionalAnnotation)
        {
            string content = DataStructReader.readWholeTextFile(file, Encoding.UTF8);

            // Singleline lets '.' cross line breaks, so a tagged span may cover several lines.
            // "type" captures the TYPE attribute value, "occ" the text between the tags.
            Regex mexPattern = new Regex("<.+?MEX TYPE=\"(?<type>.+?)\">(?<occ>.+?)</.+?MEX>", RegexOptions.Singleline);

            for (Match hit = mexPattern.Match(content); hit.Success; hit = hit.NextMatch())
            {
                string tagged = hit.Groups["occ"].Value;

                Entity found = new Entity();
                found.type      = hit.Groups["type"].Value;
                found.occurence = additionalAnnotation ? stripAnnotation(tagged) : tagged;
                nes.Add(found);
            }
        }
예제 #2
0
        /// <summary>
        /// Reads <paramref name="fileName"/> as UTF-8 text and feeds its content, piece by piece,
        /// to <c>process</c>, which updates <paramref name="counts"/>,
        /// <paramref name="singleOccurences"/> and <paramref name="total"/> and receives
        /// <paramref name="wrt"/>.
        /// </summary>
        /// <param name="counts">Passed by reference to <c>process</c>.</param>
        /// <param name="singleOccurences">Passed by reference to <c>process</c>.</param>
        /// <param name="fileName">File to read.</param>
        /// <param name="wrt">Writer handed to <c>process</c> for every piece.</param>
        /// <param name="total">Passed by reference to <c>process</c>.</param>
        /// <param name="lang">Language code; given to <c>preprocess</c>, or used to select <c>seg</c> elements.</param>
        /// <param name="alreadyProcessed">
        /// True when the file already contains <c>&lt;seg lang="..."&gt;</c> elements; false when each
        /// line must first go through <c>preprocess</c>.
        /// </param>
        private static void getUserCounts(ref Dictionary <string, double> counts, ref Dictionary <string, Dictionary <string, int> > singleOccurences, string fileName, StreamWriter wrt, ref double total, string lang, bool alreadyProcessed)
        {
            string content = DataStructReader.readWholeTextFile(fileName, Encoding.UTF8);

            if (alreadyProcessed)
            {
                // The text is already segmented: hand every <seg> of the requested language to process().
                Regex segPattern = new Regex("<seg lang=\"" + lang + "\">.+?</seg>", RegexOptions.Singleline);

                for (Match seg = segPattern.Match(content); seg.Success; seg = seg.NextMatch())
                {
                    process(seg.Value, ref counts, ref singleOccurences, ref total, wrt);
                }

                return;
            }

            // Raw text: preprocess line by line; one input line may yield several output lines.
            string[] rawLines = content.Split('\n');
            int      handled  = 0;

            foreach (string rawLine in rawLines)
            {
                string prepared = preprocess(rawLine, lang).Trim();

                foreach (string piece in prepared.Split('\n'))
                {
                    process(piece, ref counts, ref singleOccurences, ref total, wrt);
                }

                handled++;
                // Progress as an integer percentage, rewritten in place on the same console line.
                Console.Write("\r{0}%   ", 100 * handled / rawLines.Length);
            }
        }
예제 #3
0
        /// <summary>
        /// Reads <paramref name="file"/> as UTF-8 text and appends to <paramref name="nes"/> the
        /// lower-cased content of every <c>&lt;TENAME&gt;</c> element, each distinct value once,
        /// in order of first appearance.
        /// </summary>
        /// <param name="file">Path handed to <c>DataStructReader.readWholeTextFile</c>.</param>
        /// <param name="nes">List that receives the distinct terms.</param>
        /// <param name="additionalAnnotation">
        /// When true, the lower-cased text is passed through <c>stripAnnotation</c> before the
        /// duplicate check.
        /// </param>
        private static void readTerminology(string file, ref List <string> nes, bool additionalAnnotation)
        {
            string content = DataStructReader.readWholeTextFile(file, Encoding.UTF8);

            // Singleline lets a term span line breaks.
            Regex termPattern = new Regex("<TENAME>(?<occ>.+?)</TENAME>", RegexOptions.Singleline);

            // Terms already emitted; only the keys matter.
            Dictionary <string, bool> seen = new Dictionary <string, bool>();

            for (Match hit = termPattern.Match(content); hit.Success; hit = hit.NextMatch())
            {
                string term = hit.Groups["occ"].Value.ToLower();
                if (additionalAnnotation)
                {
                    term = stripAnnotation(term);
                }

                if (seen.ContainsKey(term))
                {
                    continue;
                }

                nes.Add(term);
                seen.Add(term, true);
            }
        }
예제 #4
0
        /// <summary>
        /// Runs the whole terminology pipeline: counts words in the user files, ranks single-word
        /// candidates against the language model, extracts multi-word candidates, merges both into
        /// a term list and finally annotates the files through <c>markTerms</c>.
        /// </summary>
        /// <param name="languageModelFile">
        /// Tab-separated file loaded with <c>DataStructReader.readDictionaryD</c> (columns 0 and 1).
        /// Words missing from it are scored against its <c>"_dummy_"</c> entry, so that entry must
        /// exist (a <c>KeyNotFoundException</c> is thrown otherwise).
        /// </param>
        /// <param name="inputFile">
        /// UTF-8 list with one "source file TAB output file" pair per line. Lines with fewer than
        /// two tab-separated fields are ignored; for a repeated source file the first pair wins.
        /// </param>
        /// <param name="keepIntermedyaryFiles">
        /// When true the term list is also written to <c>_terminology</c> and the intermediate
        /// files <c>_preprocessed</c>, <c>_monoTerms</c> and <c>_multiTerms</c> are kept;
        /// when false those three files are deleted at the end.
        /// </param>
        /// <param name="lang">Language code forwarded to <c>getUserCounts</c> and <c>markTerms</c>.</param>
        /// <param name="alreadyProcessed">Forwarded to <c>getUserCounts</c> and <c>markTerms</c>.</param>
        private static void terminology(string languageModelFile, string inputFile, bool keepIntermedyaryFiles, string lang, bool alreadyProcessed)
        {
            Dictionary <string, double> ncounts = DataStructReader.readDictionaryD(languageModelFile, Encoding.UTF8, 0, 1, '\t', false, null, null);

            if (ncounts.Count == 0)
            {
                Console.WriteLine("Language Model Missing... Press key for aborting!");
                Console.ReadLine();
                return;
            }

            if (!File.Exists(inputFile))
            {
                Console.WriteLine("Input File doesn't exist... Press key for aborting!");
                Console.ReadLine();
                return;
            }

            // source file -> annotated output file
            Dictionary <string, string> fileCorrespondences = new Dictionary <string, string>();
            using (StreamReader rdr = new StreamReader(inputFile, Encoding.UTF8))
            {
                string line;
                while ((line = rdr.ReadLine()) != null)
                {
                    string[] parts = line.Trim().Split('\t');
                    if (parts.Length < 2)
                    {
                        // Blank or malformed line: there is no output path to pair with.
                        continue;
                    }
                    if (!fileCorrespondences.ContainsKey(parts[0]))
                    {
                        fileCorrespondences.Add(parts[0], parts[1]);
                    }
                }
            }

            Dictionary <string, double> userCounts = new Dictionary <string, double>();
            double total = 0;
            Dictionary <string, Dictionary <string, int> > singleOccurencesFirst = new Dictionary <string, Dictionary <string, int> >();

            // The writer must be closed before "_preprocessed" is read back further down.
            using (StreamWriter wrtProcessed = new StreamWriter("_preprocessed", false, Encoding.UTF8))
            {
                wrtProcessed.AutoFlush = true;

                foreach (string file in fileCorrespondences.Keys)
                {
                    if (alreadyProcessed)
                    {
                        Console.Write("\nReading file: {0}", file);
                    }
                    else
                    {
                        Console.WriteLine("\nProcessing file: {0}", file);
                    }
                    getUserCounts(ref userCounts, ref singleOccurencesFirst, file, wrtProcessed, ref total, lang, alreadyProcessed);
                }
            }

            Console.Write("Extracting single word terms");

            // Drop words counted fewer than twice; turn the remaining counts into relative frequencies.
            // Iterating over a snapshot of the keys allows the dictionary to be modified in the loop.
            foreach (string key in userCounts.Keys.ToArray())
            {
                if (userCounts[key] < 2)
                {
                    userCounts.Remove(key);
                }
                else
                {
                    userCounts[key] = userCounts[key] / total;
                }
            }

            Dictionary <string, List <string> > singleOccurences = getSingle(singleOccurencesFirst);

            // Score = frequency in the user corpus divided by the language-model value for the word.
            Dictionary <string, double> results = new Dictionary <string, double>();
            foreach (string word in userCounts.Keys)
            {
                double reference;
                if (!ncounts.TryGetValue(word, out reference))
                {
                    // Word unknown to the language model: fall back to its "_dummy_" entry.
                    reference = ncounts["_dummy_"];
                }

                results.Add(word, userCounts[word] / reference);
            }

            string[] keys   = results.Keys.ToArray();
            double[] values = results.Values.ToArray();

            // Sorts both arrays by ascending score; written out back to front, i.e. best score first.
            Array.Sort(values, keys);

            using (StreamWriter wrt = new StreamWriter("_monoTerms", false, Encoding.UTF8))
            {
                wrt.AutoFlush = true;
                for (int i = keys.Length - 1; i >= 0; i--)
                {
                    wrt.WriteLine("{0}\t{1}", keys[i], values[i]);
                }
            }

            Console.WriteLine(" ... done!");

            Console.Write("Extracting multi word terms");

            ColocationExtractor ce = new ColocationExtractor();
            Dictionary <string, List <string> > multiOccurences = new Dictionary <string, List <string> >();

            if (ce.extractCollocations("_preprocessed", "_multiTerms"))
            {
                Console.WriteLine(" ... done!");

                Console.Write("Create index for extracting exact occurences");

                if (!Directory.Exists("_index"))
                {
                    Directory.CreateDirectory("_index");
                }
                ce.indexText("_preprocessed", "_index");
                Console.WriteLine(" ... done!");

                Console.Write("Search for exact occurences");
                Lucene.Net.Search.IndexSearcher searcher = new Lucene.Net.Search.IndexSearcher("_index");
                try
                {
                    multiOccurences = retriveOccurences(ce, searcher, "_multiTerms");
                }
                finally
                {
                    // Release the index files even if the lookup fails, so they can be deleted.
                    searcher.Close();
                }
                Console.WriteLine(" ... done!");

                // The index is only a temporary aid; remove it again.
                foreach (string f in Directory.GetFiles("_index"))
                {
                    File.Delete(f);
                }
                Directory.Delete("_index");
            }
            else
            {
                Console.WriteLine(" ... done! - no multi word terms found!");
            }

            Console.Write("Retrieving terminology");

            List <string> terms = extractTerminology("_monoTerms", "_multiTerms", singleOccurences, multiOccurences);

            if (keepIntermedyaryFiles)
            {
                using (StreamWriter wrtT = new StreamWriter("_terminology", false, Encoding.UTF8))
                {
                    wrtT.AutoFlush = true;
                    foreach (string term in terms)
                    {
                        wrtT.WriteLine(term);
                    }
                }
            }

            Console.WriteLine(" ... done!");

            HashSet <string> mono = new HashSet <string>();
            Dictionary <string, HashSet <string> > multi = new Dictionary <string, HashSet <string> >();
            HashSet <string> multiOrg = new HashSet <string>();

            getTerms(terms, ref mono, ref multi, ref multiOrg);
            markTerms(lang, fileCorrespondences, mono, multi, multiOrg, alreadyProcessed);

            if (!keepIntermedyaryFiles)
            {
                File.Delete("_preprocessed");
                File.Delete("_monoTerms");
                File.Delete("_multiTerms");
            }
        }
예제 #5
0
        /// <summary>
        /// Writes, for every source file in <paramref name="fileCorrespondences"/>, a copy in which
        /// recognised terms are wrapped in <c>&lt;TENAME&gt;...&lt;/TENAME&gt;</c>.
        /// </summary>
        /// <param name="lang">Language code used to select <c>seg</c> elements in preprocessed files.</param>
        /// <param name="fileCorrespondences">Maps each source file to the output file that is (over)written as UTF-8.</param>
        /// <param name="mono">Single-word terms; looked up with the lower-cased word.</param>
        /// <param name="multi">
        /// Looked up with the lower-cased text collected so far; the associated set is checked for the
        /// lower-cased next word to decide whether a multi-word term continues.
        /// </param>
        /// <param name="multiOrg">Looked up with the complete lower-cased multi-word candidate (preprocessed input only).</param>
        /// <param name="alreadyProcessed">
        /// False: the source is plain text and is annotated line by line.
        /// True: the source contains <c>seg</c> XML elements and one output line is written per segment.
        /// </param>
        private static void markTerms(string lang, Dictionary <string, string> fileCorrespondences, HashSet <string> mono, Dictionary <string, HashSet <string> > multi, HashSet <string> multiOrg, bool alreadyProcessed)
        {
            foreach (string file in fileCorrespondences.Keys)
            {
                Console.Write("Annotating {0}... ", Path.GetFileName(file));

                // 'using' closes the output even when annotation fails half-way.
                using (StreamWriter wrt = new StreamWriter(fileCorrespondences[file], false, Encoding.UTF8))
                {
                    wrt.AutoFlush = true;

                    if (!alreadyProcessed)
                    {
                        markTermsInRawText(file, wrt, mono, multi);
                    }
                    else
                    {
                        markTermsInSegments(file, lang, wrt, mono, multi, multiOrg);
                    }
                }

                Console.WriteLine("done.");
            }
        }

        // Annotates a plain-text file line by line, copying every non-term character unchanged.
        private static void markTermsInRawText(string file, StreamWriter wrt, HashSet <string> mono, Dictionary <string, HashSet <string> > multi)
        {
            const string openTag = "<TENAME>";

            // Each match is either a whole word (letters, digits, '_', '-') or a single other character.
            Regex         regex  = new Regex("(?<word>[\\w-]+)|(?<char>.)", RegexOptions.None);
            StringBuilder sb     = new StringBuilder();   // open tag + text of the multi-word term being collected
            bool          inside = false;                 // true while such a term is open

            using (StreamReader rdr = new StreamReader(file, Encoding.UTF8))
            {
                string line;
                while ((line = rdr.ReadLine()) != null)
                {
                    for (Match m = regex.Match(line); m.Success; m = m.NextMatch())
                    {
                        string val   = m.Groups["word"].Value;
                        string lower = val.ToLower();

                        if (val != "" && (mono.Contains(lower) || multi.ContainsKey(lower) || inside))
                        {
                            if (!inside)
                            {
                                if (!multi.ContainsKey(lower))
                                {
                                    // Plain single-word term.
                                    wrt.Write(openTag + val + "</TENAME>");
                                }
                                else
                                {
                                    // Possible start of a multi-word term: hold it back until we know how far it goes.
                                    sb.Append(openTag + val);
                                    inside = true;
                                }
                            }
                            else
                            {
                                // Text collected so far, without the open tag. (The tag is 8 characters long;
                                // skipping 9 would also cut the first letter of the term and break the lookup.)
                                string key = sb.ToString().Substring(openTag.Length).ToLower().Trim();

                                HashSet <string> continuations;
                                if (multi.TryGetValue(key, out continuations) && continuations.Contains(lower))
                                {
                                    sb.Append(val);
                                }
                                else
                                {
                                    // The word does not continue the term: close it and emit the word as plain text.
                                    wrt.Write(sb.ToString().Trim() + "</TENAME>");
                                    wrt.Write(" " + val);
                                    inside = false;
                                    sb     = new StringBuilder();
                                }
                            }
                        }
                        else if (!inside)
                        {
                            wrt.Write(m.Value);
                        }
                        else if (m.Value != " ")
                        {
                            // Any character other than a space ends the open term.
                            wrt.Write(sb.ToString() + "</TENAME>");
                            wrt.Write(m.Value);
                            inside = false;
                            sb     = new StringBuilder();
                        }
                        else
                        {
                            sb.Append(" ");
                        }
                    }

                    if (inside)
                    {
                        // A term still open at the end of the line ends here; otherwise it would leak into
                        // the next line (or be lost at the end of the file). Trailing blanks stay outside the tag.
                        string pending = sb.ToString();
                        string trimmed = pending.TrimEnd();
                        wrt.Write(trimmed + "</TENAME>" + pending.Substring(trimmed.Length));
                        inside = false;
                        sb     = new StringBuilder();
                    }

                    wrt.WriteLine();
                }
            }
        }

        // Annotates a preprocessed file: every <seg lang="..."> element becomes one output line built
        // from its <w> (word) and <c> nodes, separated by single spaces.
        private static void markTermsInSegments(string file, string lang, StreamWriter wrt, HashSet <string> mono, Dictionary <string, HashSet <string> > multi, HashSet <string> multiOrg)
        {
            const string openTag = "<TENAME>";

            string text  = DataStructReader.readWholeTextFile(file, Encoding.UTF8);
            Regex  regex = new Regex(
                "<seg lang=\"" + lang + "\">.+?</seg>",
                RegexOptions.Singleline
                );

            for (Match m = regex.Match(text); m.Success; m = m.NextMatch())
            {
                try
                {
                    XmlDocument xdoc = new XmlDocument();
                    xdoc.LoadXml("<!DOCTYPE root [<!ENTITY % SGMLUniq SYSTEM \"sgmlunic.ent\"> %SGMLUniq;]>\n<root>" + removeXmlControlChars(m.Value).Replace("&b.theta;", "&b.Theta;") + "</root>");
                    XmlNodeList list = xdoc.SelectNodes("//w|//c");

                    StringBuilder sb       = new StringBuilder();   // " <TENAME>" + words of the open multi-word candidate
                    bool          inside   = false;                 // true while such a candidate is open
                    StringBuilder sentence = new StringBuilder();   // output line for this segment
                    string        firstPos = "";                    // part-of-speech letter of the candidate's first word

                    foreach (XmlNode node in list)
                    {
                        bool alreadyAdded = false;

                        if (node.Name == "w")
                        {
                            string val   = node.InnerText.Replace("_", " ");
                            string lower = val.ToLower();

                            // First letter of the "ana" attribute, lower-cased; "" when the attribute is
                            // missing or empty, so the word is simply treated as neither noun nor adjective.
                            XmlAttribute ana = node.Attributes["ana"];
                            string       pos = (ana != null && ana.InnerText.Length > 0) ? ana.InnerText.Substring(0, 1).ToLower() : "";

                            if (val != "" && (mono.Contains(lower) || multi.ContainsKey(lower) || inside))
                            {
                                if (!inside)
                                {
                                    if (pos != "n" && pos != "a")
                                    {
                                        sentence.Append(" " + val);
                                    }
                                    else if (multi.ContainsKey(lower))
                                    {
                                        // Possible start of a multi-word term.
                                        sb.Append(" " + openTag + val);
                                        firstPos = pos;
                                        inside   = true;
                                    }
                                    else if (pos == "n")
                                    {
                                        sentence.Append(" " + openTag + val + "</TENAME>");
                                    }
                                    else
                                    {
                                        // Only "n" words are tagged on their own; other words are kept as plain text
                                        // (previously such a word disappeared from the output altogether).
                                        sentence.Append(" " + val);
                                    }
                                }
                                else
                                {
                                    string pending = sb.ToString().Trim().Substring(openTag.Length).Trim();

                                    HashSet <string> continuations;
                                    if (multi.TryGetValue(pending.ToLower(), out continuations) && continuations.Contains(lower))
                                    {
                                        sb.Append(" " + val);
                                    }
                                    else
                                    {
                                        // The word does not continue the candidate: emit the candidate, then the word.
                                        appendPendingTerm(sentence, pending, firstPos, mono, multiOrg);
                                        sentence.Append(" " + val);
                                        inside = false;
                                        sb     = new StringBuilder();
                                    }
                                }

                                alreadyAdded = true;
                            }
                        }

                        if (!alreadyAdded)
                        {
                            if (inside)
                            {
                                // A non-word node ends the open candidate, which is tagged as collected.
                                // (The original guarded this with "m.Value != \" \"", which compares the
                                // whole segment and is therefore always true.)
                                sentence.Append(" " + sb.ToString().Trim() + "</TENAME>");
                                inside = false;
                                sb     = new StringBuilder();
                            }
                            sentence.Append(" " + node.InnerText.Replace("_", " "));
                        }
                    }

                    if (inside)
                    {
                        // The segment ended while a candidate was still open; close it the same way a
                        // non-word node does, instead of losing its words.
                        sentence.Append(" " + sb.ToString().Trim() + "</TENAME>");
                    }

                    wrt.WriteLine(sentence.ToString().Trim());
                }
                catch (Exception ex)
                {
                    // Best effort: a segment that cannot be parsed is left out, but no longer silently.
                    Console.Error.WriteLine("Warning: segment skipped in {0}: {1}", file, ex.Message);
                }
            }
        }

        // Appends a multi-word candidate that was cut short by a non-matching word: tagged as a whole
        // when it is a complete term, otherwise only its first word is tagged if that is a noun term.
        private static void appendPendingTerm(StringBuilder sentence, string toAdd, string firstPos, HashSet <string> mono, HashSet <string> multiOrg)
        {
            if (multiOrg.Contains(toAdd.ToLower()))
            {
                sentence.Append(" <TENAME>" + toAdd + "</TENAME>");
                return;
            }

            int idx = toAdd.IndexOf(' ');
            if (idx != -1)
            {
                string first = toAdd.Substring(0, idx);
                string rest  = toAdd.Substring(idx + 1);

                if (mono.Contains(first.ToLower()) && firstPos == "n")
                {
                    sentence.Append(" <TENAME>" + first + "</TENAME> " + rest);
                }
                else
                {
                    sentence.Append(" " + toAdd);
                }
            }
            else if (firstPos == "n")
            {
                sentence.Append(" <TENAME>" + toAdd + "</TENAME>");
            }
            else
            {
                sentence.Append(" " + toAdd);
            }
        }

        // Removes control characters below U+0020 other than tab, line feed and carriage return.
        // XML 1.0 does not allow them, so LoadXml would reject the whole segment. This replaces the
        // original per-character Replace chain, one link of which had an empty search string and
        // would throw ArgumentException.
        private static string removeXmlControlChars(string text)
        {
            StringBuilder clean = new StringBuilder(text.Length);
            foreach (char ch in text)
            {
                if (ch >= ' ' || ch == '\t' || ch == '\n' || ch == '\r')
                {
                    clean.Append(ch);
                }
            }
            return clean.ToString();
        }