Example #1
0
        /// <summary>Reads the morpheme dictionary file and passes each (word, tag) entry to <c>store</c>.</summary>
        /// <remarks>
        /// Each non-blank line is split on tabs and spaces. The first token is the word; every following
        /// token is either <c>tag</c> or <c>tag.irregular</c>. A token whose tag is unknown to
        /// <paramref name="tagSet"/> is reported on standard error and skipped.
        /// </remarks>
        /// <param name="dictionaryFileName">- the file path of the morpheme dictionary (decoded as UTF-8)
        /// </param>
        /// <param name="tagSet">- the morpheme tag set
        /// </param>
        /// <exception cref="System.IO.IOException">the dictionary file cannot be opened or read</exception>
        public virtual void  read_dic(System.String dictionaryFileName, TagSet tagSet)
        {
            // 'using' closes the dictionary file on every exit path, including parse failures.
            using (System.IO.StreamReader in_Renamed = new System.IO.StreamReader(
                new System.IO.FileStream(dictionaryFileName, System.IO.FileMode.Open, System.IO.FileAccess.Read),
                System.Text.Encoding.UTF8))
            {
                // Scratch entries that are refilled for every dictionary line.
                INFO[] info_list = new INFO[255];
                for (int i = 0; i < info_list.Length; i++)
                {
                    info_list[i] = new INFO(this);
                }

                System.String str;
                while ((str = in_Renamed.ReadLine()) != null)
                {
                    // String.Trim returns a new string; the result must be kept, otherwise
                    // whitespace-only lines are not recognised as blank.
                    str = str.Trim();
                    if (str.Length == 0)
                    {
                        continue;
                    }

                    StringTokenizer tok   = new StringTokenizer(str, "\t ");
                    System.String   word  = tok.NextToken;
                    int             isize = 0;

                    while (tok.HasMoreTokens)
                    {
                        // A data token is "tag" or "tag.irregular".
                        System.String   data = tok.NextToken;
                        StringTokenizer tok2 = new StringTokenizer(data, ".");
                        System.String   curt = tok2.NextToken;
                        int             x    = tagSet.getTagID(curt);
                        if (x == -1)
                        {
                            System.Console.Error.WriteLine("read_dic:tag error");
                            continue;
                        }

                        if (tok2.HasMoreTokens)
                        {
                            info_list[isize].phoneme = (short)tagSet.getIrregularID(tok2.NextToken);
                        }
                        else
                        {
                            info_list[isize].phoneme = TagSet.PHONEME_TYPE_ALL;
                        }

                        info_list[isize].tag = x;
                        isize++;
                    }

                    // Zero the entry that follows the last one filled for this line.
                    info_list[isize].tag     = 0;
                    info_list[isize].phoneme = 0;

                    char[] word3 = Code.toTripleArray(word);
                    for (int i = 0; i < isize; i++)
                    {
                        store(word3, info_list[i]);
                    }
                }
            }
        }
Example #2
0
        /// <summary>Decides whether the morpheme tagged <paramref name="tag2"/> may directly follow the one tagged <paramref name="tag1"/>.</summary>
        /// <remarks>
        /// When the first tag name starts with "nc" or 'f' and the second starts with 'n', the pair is
        /// rejected outright if the second tag name starts with "nq", or if <paramref name="len1"/> is
        /// below 4 or <paramref name="len2"/> is below 2. Otherwise the answer comes from the connection
        /// table combined with <c>tagSet.checkTagType</c>.
        /// </remarks>
        /// <param name="tagSet">- morpheme tag set
        /// </param>
        /// <param name="tag1">- the first morpheme tag to check
        /// </param>
        /// <param name="tag2">- the second morpheme tag to check
        /// </param>
        /// <param name="len1">- the length of the first morpheme
        /// </param>
        /// <param name="len2">- the length of the second morpheme
        /// </param>
        /// <param name="typeOfTag2">- the tag type of the second morpheme tag
        /// </param>
        /// <returns> true: the two consecutive morpheme tags can appear, false: they cannot appear
        /// </returns>
        public virtual bool checkConnection(TagSet tagSet, int tag1, int tag2, int len1, int len2, int typeOfTag2)
        {
            System.String firstName  = tagSet.getTagName(tag1);
            System.String secondName = tagSet.getTagName(tag2);

            bool firstIsNcOrF = firstName.StartsWith("nc") || firstName[0] == 'f';
            if (firstIsNcOrF && secondName[0] == 'n')
            {
                // Special case: either condition alone vetoes the pair.
                bool vetoed = secondName.StartsWith("nq") || len1 < 4 || len2 < 2;
                if (vetoed)
                {
                    return false;
                }
            }

            // General case: consult the table, then confirm the tag type of the second tag.
            return connectionTable[tag1][tag2] && tagSet.checkTagType(typeOfTag2, tag2);
        }
Example #3
0
 /// <summary>Writes a textual dump of the trie, followed by its free list, to <c>./data/kE/output.txt</c>.</summary>
 /// <remarks>
 /// A <see cref="System.IO.FileNotFoundException"/> raised while opening the file is reported on
 /// standard error and swallowed.
 /// </remarks>
 /// <param name="tagSet">- the morpheme tag set used in the trie structure
 /// </param>
 public virtual void  print_result(TagSet tagSet)
 {
     try
     {
         // 'using' closes (and thereby flushes) the writer even if dumping a node throws part-way.
         using (System.IO.StreamWriter pw = new StreamWriter("./data/kE/output.txt"))
         {
             // Dump every child of the head node together with its subtree.
             for (int k = 0; k < node_head.child_size; k++)
             {
                 print_trie(pw, node_head.child_idx + k, 0, tagSet);
             }

             // Walk the free list; index 0 ends the chain.
             for (int ii = free_head.next_idx; ii != 0; ii = trie_buf[ii].free.next_idx)
             {
                 pw.Write("[n:" + ii + " s:" + trie_buf[ii].free.size + "] ");
             }
             pw.WriteLine();
             pw.Flush();
         }
     }
     catch (System.IO.FileNotFoundException e)
     {
         SupportClass.WriteStackTrace(e, Console.Error);
     }
 }
Example #4
0
        /// <summary>Loads every resource of the Chart-based Morphological Analyzer plug-in and wires the parts together.</summary>
        /// <param name="baseDir">- the base directory; each resource path read from the configuration is appended to it after a "/"
        /// </param>
        /// <param name="configFile">- the path for the configuration file; it is handed to <c>JSONReader</c> exactly as given
        /// </param>
        public virtual void  initialize(System.String baseDir, System.String configFile)
        {
            JSONReader json = new JSONReader(configFile);

            // Resolve every configured resource below the base directory.
            System.String root = baseDir + "/";
            fileDicSystem      = root + json.getValue("dic_system");
            fileDicUser        = root + json.getValue("dic_user");
            fileConnections    = root + json.getValue("connections");
            fileConnectionsNot = root + json.getValue("connections_not");
            fileDicAnalyzed    = root + json.getValue("dic_analyzed");
            fileTagSet         = root + json.getValue("tagset");

            // The tag set comes first: the connection rules and both dictionaries are read against it.
            tagSet = new TagSet();
            tagSet.init(fileTagSet, TagSet.TAG_SET_KAIST);

            // Connection rules (allowed and disallowed).
            connection = new Connection();
            connection.init(fileConnections, tagSet.TagCount, tagSet);
            connectionNot = new ConnectionNot();
            connectionNot.init(fileConnectionsNot, tagSet);

            // Dictionaries: pre-analyzed, system and user.
            analyzedDic = new AnalyzedDic();
            analyzedDic.readDic(fileDicAnalyzed);
            systemDic = new Trie(Trie.DEFAULT_TRIE_BUF_SIZE_SYS);
            systemDic.read_dic(fileDicSystem, tagSet);
            userDic = new Trie(Trie.DEFAULT_TRIE_BUF_SIZE_USER);
            userDic.read_dic(fileDicUser, tagSet);

            // Remaining collaborators of the chart.
            numDic = new NumberDic();
            simti  = new Simti();
            simti.init();
            eojeolList = new LinkedList<Eojeol>();

            chart    = new MorphemeChart(tagSet, connection, systemDic, userDic, numDic, simti, eojeolList);
            postProc = new PostProcessor();
        }
Example #5
0
        /// <summary>Creates a morpheme chart bound to the given resources, with every chart slot pre-allocated.</summary>
        /// <param name="tagSet">- the morpheme tag set
        /// </param>
        /// <param name="connection">- the morpheme connection rules
        /// </param>
        /// <param name="systemDic">- the system morpheme dictionary
        /// </param>
        /// <param name="userDic">- the user morpheme dictionary
        /// </param>
        /// <param name="numDic">- the number dictionary
        /// </param>
        /// <param name="simti">- the SIMple Trie Index
        /// </param>
        /// <param name="resEojeolList">- the list of eojeols to store the analysis result
        /// </param>
        public MorphemeChart(TagSet tagSet, Connection connection, Trie systemDic, Trie userDic, NumberDic numDic, Simti simti, LinkedList <Eojeol> resEojeolList)
        {
            // Give every chart slot its own Morpheme instance.
            chart = new Morpheme[MAX_MORPHEME_CHART];
            int slot = 0;
            while (slot < MAX_MORPHEME_CHART)
            {
                chart[slot] = new Morpheme(this);
                slot++;
            }

            // Collaborators supplied by the caller.
            this.tagSet     = tagSet;
            this.connection = connection;
            this.systemDic  = systemDic;
            this.userDic    = userDic;
            this.numDic     = numDic;
            this.simti      = simti;
            this.resEojeols = resEojeolList;

            // Objects owned by this chart.
            this.sp  = new SegmentPosition();
            this.exp = new Exp(this, tagSet);

            resMorphemes       = new List<String>();
            resTags            = new List<String>();
            chiReplacementList = new LinkedList<String>();
            engReplacementList = new LinkedList<String>();
        }
Example #6
0
        /// <summary>Reads the morpheme dictionary file and passes each (word, tag) entry to <c>store</c>.</summary>
        /// <remarks>
        /// Each non-blank line is split on tabs and spaces. The first token is the word; every following
        /// token is either <c>tag</c> or <c>tag.irregular</c>. A token whose tag is unknown to
        /// <paramref name="tagSet"/> is reported on standard error and skipped.
        /// </remarks>
        /// <param name="dictionaryFileName">- the file path of the morpheme dictionary (decoded as UTF-8)
        /// </param>
        /// <param name="tagSet">- the morpheme tag set
        /// </param>
        /// <exception cref="System.IO.IOException">the dictionary file cannot be opened or read</exception>
        public virtual void read_dic(System.String dictionaryFileName, TagSet tagSet)
        {
            // 'using' closes the dictionary file on every exit path, including parse failures.
            using (System.IO.StreamReader in_Renamed = new System.IO.StreamReader(
                new System.IO.FileStream(dictionaryFileName, System.IO.FileMode.Open, System.IO.FileAccess.Read),
                System.Text.Encoding.UTF8))
            {
                // Scratch entries that are refilled for every dictionary line.
                INFO[] info_list = new INFO[255];
                for (int i = 0; i < info_list.Length; i++)
                {
                    info_list[i] = new INFO(this);
                }

                System.String str;
                while ((str = in_Renamed.ReadLine()) != null)
                {
                    // String.Trim returns a new string; the result must be kept, otherwise
                    // whitespace-only lines are not recognised as blank.
                    str = str.Trim();
                    if (str.Length == 0)
                    {
                        continue;
                    }

                    StringTokenizer tok = new StringTokenizer(str, "\t ");
                    System.String word = tok.NextToken;
                    int isize = 0;

                    while (tok.HasMoreTokens)
                    {
                        // A data token is "tag" or "tag.irregular".
                        System.String data = tok.NextToken;
                        StringTokenizer tok2 = new StringTokenizer(data, ".");
                        System.String curt = tok2.NextToken;
                        int x = tagSet.getTagID(curt);
                        if (x == -1)
                        {
                            System.Console.Error.WriteLine("read_dic:tag error");
                            continue;
                        }

                        if (tok2.HasMoreTokens)
                        {
                            info_list[isize].phoneme = (short) tagSet.getIrregularID(tok2.NextToken);
                        }
                        else
                        {
                            info_list[isize].phoneme = TagSet.PHONEME_TYPE_ALL;
                        }

                        info_list[isize].tag = x;
                        isize++;
                    }

                    // Zero the entry that follows the last one filled for this line.
                    info_list[isize].tag = 0;
                    info_list[isize].phoneme = 0;

                    char[] word3 = Code.toTripleArray(word);
                    for (int i = 0; i < isize; i++)
                    {
                        store(word3, info_list[i]);
                    }
                }
            }
        }
Example #7
0
 /// <summary>Writes the node at <paramref name="idx"/> and then, recursively, each of its children.</summary>
 /// <remarks>
 /// A node takes one line: <paramref name="depth"/> tab characters, then <c>idx:key</c>, then one
 /// <c>t:tagName</c> item per entry of the node's info list when the list is present.
 /// </remarks>
 /// <param name="pw">- for printing the trie structure
 /// </param>
 /// <param name="idx">- the index of trie node
 /// </param>
 /// <param name="depth">- the depth of current node
 /// </param>
 /// <param name="tagSet">- the morpheme tag set used in the trie structure
 /// </param>
 public virtual void print_trie(System.IO.StreamWriter pw, int idx, int depth, TagSet tagSet)
 {
     // Indent by one tab per level.
     for (int remaining = depth; remaining > 0; remaining--)
     {
         pw.Write("\t");
     }

     var node = trie_buf[idx];
     pw.Write(idx + ":" + Code.toCompatibilityJamo(node.key) + " ");

     var infos = node.info_list;
     if (infos != null)
     {
         int k = 0;
         while (k < infos.Count)
         {
             pw.Write("t:" + tagSet.getTagName(infos.Get_Renamed(k).tag) + " ");
             k++;
         }
     }
     pw.WriteLine();

     // Children occupy consecutive indices starting at child_idx.
     int firstChild = node.child_idx;
     int childCount = node.child_size;
     for (int offset = 0; offset < childCount; offset++)
     {
         print_trie(pw, firstChild + offset, depth + 1, tagSet);
     }
 }
Example #8
0
 /// <summary>Writes a textual dump of the trie, followed by its free list, to <c>./data/kE/output.txt</c>.</summary>
 /// <remarks>
 /// A <see cref="System.IO.FileNotFoundException"/> raised while opening the file is reported on
 /// standard error and swallowed.
 /// </remarks>
 /// <param name="tagSet">- the morpheme tag set used in the trie structure
 /// </param>
 public virtual void print_result(TagSet tagSet)
 {
     try
     {
         // 'using' closes (and thereby flushes) the writer even if dumping a node throws part-way.
         using (System.IO.StreamWriter pw = new StreamWriter("./data/kE/output.txt"))
         {
             // Dump every child of the head node together with its subtree.
             for (int k = 0; k < node_head.child_size; k++)
             {
                 print_trie(pw, node_head.child_idx + k, 0, tagSet);
             }

             // Walk the free list; index 0 ends the chain.
             for (int ii = free_head.next_idx; ii != 0; ii = trie_buf[ii].free.next_idx)
             {
                 pw.Write("[n:" + ii + " s:" + trie_buf[ii].free.size + "] ");
             }
             pw.WriteLine();
             pw.Flush();
         }
     }
     catch (System.IO.FileNotFoundException e)
     {
         SupportClass.WriteStackTrace(e, Console.Error);
     }
 }
Example #9
0
        /// <summary>Creates a morpheme chart bound to the given resources, with every chart slot pre-allocated.</summary>
        /// <param name="tagSet">- the morpheme tag set
        /// </param>
        /// <param name="connection">- the morpheme connection rules
        /// </param>
        /// <param name="systemDic">- the system morpheme dictionary
        /// </param>
        /// <param name="userDic">- the user morpheme dictionary
        /// </param>
        /// <param name="numDic">- the number dictionary
        /// </param>
        /// <param name="simti">- the SIMple Trie Index
        /// </param>
        /// <param name="resEojeolList">- the list of eojeols to store the analysis result
        /// </param>
        public MorphemeChart(TagSet tagSet, Connection connection, Trie systemDic, Trie userDic, NumberDic numDic, Simti simti, LinkedList<Eojeol> resEojeolList)
        {
            // Give every chart slot its own Morpheme instance.
            chart = new Morpheme[MAX_MORPHEME_CHART];
            int slot = 0;
            while (slot < MAX_MORPHEME_CHART)
            {
                chart[slot] = new Morpheme(this);
                slot++;
            }

            // Collaborators supplied by the caller.
            this.tagSet = tagSet;
            this.connection = connection;
            this.systemDic = systemDic;
            this.userDic = userDic;
            this.numDic = numDic;
            this.simti = simti;
            this.resEojeols = resEojeolList;

            // Objects owned by this chart.
            this.sp = new SegmentPosition();
            this.exp = new Exp(this, tagSet);

            resMorphemes = new List<String>();
            resTags = new List<String>();
            chiReplacementList = new LinkedList<String>();
            engReplacementList = new LinkedList<String>();
        }
Example #10
0
        /// <summary>Reads the connection rule data file and rebuilds the connection table from it.</summary>
        /// <remarks>
        /// Lines are tab-separated and dispatched on their first field:
        /// <c>@title</c>, <c>@version</c>, <c>@copyright</c>, <c>@author</c>, <c>@date</c> and <c>@editor</c>
        /// set the corresponding metadata field; <c>START_TAG</c> sets the start tag; <c>CONNECTION</c> is
        /// followed by a rule <c>left*right</c>. Each side of a rule is a list of items separated by
        /// <c>,</c>, <c>(</c> or <c>)</c>. Within an item split on <c>-</c>, the first name contributes
        /// its tag ids and the ids of the remaining names are removed from that side's set. Every
        /// (left id, right id) pair of a rule is marked <c>true</c> in the table. Other lines are ignored.
        /// </remarks>
        /// <param name="filePath">- the path for the connection rule file (decoded as UTF-8)
        /// </param>
        /// <param name="tagCount">- the number of total tags in the tag set
        /// </param>
        /// <param name="tagSet">- the tag set which is used in the connection rules
        /// </param>
        /// <exception cref="System.IO.IOException">the rule file cannot be opened or read</exception>
        private void readFile(System.String filePath, int tagCount, TagSet tagSet)
        {
            // 'using' closes the file even when a malformed line makes parsing throw.
            using (System.IO.StreamReader br = new System.IO.StreamReader(
                new System.IO.FileStream(filePath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
                System.Text.Encoding.UTF8))
            {
                title = "";
                version = "";
                copyright = "";
                author = "";
                date = "";
                editor = "";
                startTag = "";

                // A freshly allocated bool[] is all false, so no pair is connectable yet.
                connectionTable = new bool[tagCount][];
                for (int i = 0; i < tagCount; i++)
                {
                    connectionTable[i] = new bool[tagCount];
                }

                // sides[0] collects the tag ids left of '*', sides[1] those right of it.
                HashSet<int>[] sides = new HashSet<int>[] { new HashSet<int>(), new HashSet<int>() };

                System.String line;
                while ((line = br.ReadLine()) != null)
                {
                    StringTokenizer lineTokenizer = new StringTokenizer(line, "\t");
                    if (!lineTokenizer.HasMoreTokens)
                    {
                        continue;
                    }

                    System.String lineToken = lineTokenizer.NextToken;

                    if (lineToken.StartsWith("@"))
                    {
                        switch (lineToken)
                        {
                            case "@title":
                                title = lineTokenizer.NextToken;
                                break;
                            case "@version":
                                version = lineTokenizer.NextToken;
                                break;
                            case "@copyright":
                                copyright = lineTokenizer.NextToken;
                                break;
                            case "@author":
                                author = lineTokenizer.NextToken;
                                break;
                            case "@date":
                                date = lineTokenizer.NextToken;
                                break;
                            case "@editor":
                                editor = lineTokenizer.NextToken;
                                break;
                        }
                    }
                    else if ("CONNECTION".Equals(lineToken))
                    {
                        // Split on the first literal '*' only. String.Split does not take a regular
                        // expression, so a "\\*" separator would look for a backslash followed by '*'.
                        System.String[] tagLists = lineTokenizer.NextToken.Split(new char[] { '*' }, 2);

                        for (int side = 0; side < sides.Length; side++)
                        {
                            HashSet<int> ids = sides[side];
                            ids.Clear();

                            StringTokenizer tagTokenizer = new StringTokenizer(tagLists[side], ",()");
                            while (tagTokenizer.HasMoreTokens)
                            {
                                StringTokenizer tok = new StringTokenizer(tagTokenizer.NextToken, "-");
                                if (!tok.HasMoreTokens)
                                {
                                    continue;
                                }

                                // The first name adds its tag ids (the getTags result when it is
                                // non-null, otherwise the single id from getTagID).
                                System.String t = tok.NextToken;
                                int[] fullTagIDSet = tagSet.getTags(t);
                                if (fullTagIDSet != null)
                                {
                                    foreach (int id in fullTagIDSet)
                                    {
                                        ids.Add(id);
                                    }
                                }
                                else
                                {
                                    ids.Add(tagSet.getTagID(t));
                                }

                                // Every further '-'-separated name is an exclusion.
                                while (tok.HasMoreTokens)
                                {
                                    ids.Remove(tagSet.getTagID(tok.NextToken));
                                }
                            }
                        }

                        foreach (int leftSide in sides[0])
                        {
                            foreach (int rightSide in sides[1])
                            {
                                connectionTable[leftSide][rightSide] = true;
                            }
                        }
                    }
                    else if ("START_TAG".Equals(lineToken))
                    {
                        startTag = lineTokenizer.NextToken;
                    }
                }
            }
        }
Example #11
0
 /// <summary>Creates the object and binds it to its morpheme chart and tag set.</summary>
 /// <param name="mc">- the lattice style morpheme chart
 /// </param>
 /// <param name="tagSet">- morpheme tag set
 /// </param>
 public Exp(MorphemeChart mc, TagSet tagSet)
 {
     // pset_end records the length of pset.
     pset_end    = pset.Length;
     this.tagSet = tagSet;
     this.mc     = mc;
 }
Example #12
0
 /// <summary>Writes the node at <paramref name="idx"/> and then, recursively, each of its children.</summary>
 /// <remarks>
 /// A node takes one line: <paramref name="depth"/> tab characters, then <c>idx:key</c>, then one
 /// <c>t:tagName</c> item per entry of the node's info list when the list is present.
 /// </remarks>
 /// <param name="pw">- for printing the trie structure
 /// </param>
 /// <param name="idx">- the index of trie node
 /// </param>
 /// <param name="depth">- the depth of current node
 /// </param>
 /// <param name="tagSet">- the morpheme tag set used in the trie structure
 /// </param>
 public virtual void  print_trie(System.IO.StreamWriter pw, int idx, int depth, TagSet tagSet)
 {
     // Indent by one tab per level.
     for (int remaining = depth; remaining > 0; remaining--)
     {
         pw.Write("\t");
     }

     var node = trie_buf[idx];
     pw.Write(idx + ":" + Code.toCompatibilityJamo(node.key) + " ");

     var infos = node.info_list;
     if (infos != null)
     {
         int k = 0;
         while (k < infos.Count)
         {
             pw.Write("t:" + tagSet.getTagName(infos.Get_Renamed(k).tag) + " ");
             k++;
         }
     }
     pw.WriteLine();

     // Children occupy consecutive indices starting at child_idx.
     int firstChild = node.child_idx;
     int childCount = node.child_size;
     for (int offset = 0; offset < childCount; offset++)
     {
         print_trie(pw, firstChild + offset, depth + 1, tagSet);
     }
 }
Example #13
0
        /// <summary>Reads the connection rule data file and rebuilds the connection table from it.</summary>
        /// <remarks>
        /// Lines are tab-separated and dispatched on their first field:
        /// <c>@title</c>, <c>@version</c>, <c>@copyright</c>, <c>@author</c>, <c>@date</c> and <c>@editor</c>
        /// set the corresponding metadata field; <c>START_TAG</c> sets the start tag; <c>CONNECTION</c> is
        /// followed by a rule <c>left*right</c>. Each side of a rule is a list of items separated by
        /// <c>,</c>, <c>(</c> or <c>)</c>. Within an item split on <c>-</c>, the first name contributes
        /// its tag ids and the ids of the remaining names are removed from that side's set. Every
        /// (left id, right id) pair of a rule is marked <c>true</c> in the table. Other lines are ignored.
        /// </remarks>
        /// <param name="filePath">- the path for the connection rule file (decoded as UTF-8)
        /// </param>
        /// <param name="tagCount">- the number of total tags in the tag set
        /// </param>
        /// <param name="tagSet">- the tag set which is used in the connection rules
        /// </param>
        /// <exception cref="System.IO.IOException">the rule file cannot be opened or read</exception>
        private void  readFile(System.String filePath, int tagCount, TagSet tagSet)
        {
            // 'using' closes the file even when a malformed line makes parsing throw.
            using (System.IO.StreamReader br = new System.IO.StreamReader(
                new System.IO.FileStream(filePath, System.IO.FileMode.Open, System.IO.FileAccess.Read),
                System.Text.Encoding.UTF8))
            {
                title     = "";
                version   = "";
                copyright = "";
                author    = "";
                date      = "";
                editor    = "";
                startTag  = "";

                // A freshly allocated bool[] is all false, so no pair is connectable yet.
                connectionTable = new bool[tagCount][];
                for (int i = 0; i < tagCount; i++)
                {
                    connectionTable[i] = new bool[tagCount];
                }

                // sides[0] collects the tag ids left of '*', sides[1] those right of it.
                HashSet <int>[] sides = new HashSet <int>[] { new HashSet <int>(), new HashSet <int>() };

                System.String line;
                while ((line = br.ReadLine()) != null)
                {
                    StringTokenizer lineTokenizer = new StringTokenizer(line, "\t");
                    if (!lineTokenizer.HasMoreTokens)
                    {
                        continue;
                    }

                    System.String lineToken = lineTokenizer.NextToken;

                    if (lineToken.StartsWith("@"))
                    {
                        switch (lineToken)
                        {
                            case "@title":
                                title = lineTokenizer.NextToken;
                                break;
                            case "@version":
                                version = lineTokenizer.NextToken;
                                break;
                            case "@copyright":
                                copyright = lineTokenizer.NextToken;
                                break;
                            case "@author":
                                author = lineTokenizer.NextToken;
                                break;
                            case "@date":
                                date = lineTokenizer.NextToken;
                                break;
                            case "@editor":
                                editor = lineTokenizer.NextToken;
                                break;
                        }
                    }
                    else if ("CONNECTION".Equals(lineToken))
                    {
                        // Split on the first literal '*' only. String.Split does not take a regular
                        // expression, so a "\\*" separator would look for a backslash followed by '*'.
                        System.String[] tagLists = lineTokenizer.NextToken.Split(new char[] { '*' }, 2);

                        for (int side = 0; side < sides.Length; side++)
                        {
                            HashSet <int> ids = sides[side];
                            ids.Clear();

                            StringTokenizer tagTokenizer = new StringTokenizer(tagLists[side], ",()");
                            while (tagTokenizer.HasMoreTokens)
                            {
                                StringTokenizer tok = new StringTokenizer(tagTokenizer.NextToken, "-");
                                if (!tok.HasMoreTokens)
                                {
                                    continue;
                                }

                                // The first name adds its tag ids (the getTags result when it is
                                // non-null, otherwise the single id from getTagID).
                                System.String t            = tok.NextToken;
                                int[]         fullTagIDSet = tagSet.getTags(t);
                                if (fullTagIDSet != null)
                                {
                                    foreach (int id in fullTagIDSet)
                                    {
                                        ids.Add(id);
                                    }
                                }
                                else
                                {
                                    ids.Add(tagSet.getTagID(t));
                                }

                                // Every further '-'-separated name is an exclusion.
                                while (tok.HasMoreTokens)
                                {
                                    ids.Remove(tagSet.getTagID(tok.NextToken));
                                }
                            }
                        }

                        foreach (int leftSide in sides[0])
                        {
                            foreach (int rightSide in sides[1])
                            {
                                connectionTable[leftSide][rightSide] = true;
                            }
                        }
                    }
                    else if ("START_TAG".Equals(lineToken))
                    {
                        startTag = lineTokenizer.NextToken;
                    }
                }
            }
        }
        /// <summary>Loads every resource of the Chart-based Morphological Analyzer plug-in and wires the parts together.</summary>
        /// <param name="baseDir">- the base directory; each resource path read from the configuration is appended to it after a "/"
        /// </param>
        /// <param name="configFile">- the path for the configuration file; it is handed to <c>JSONReader</c> exactly as given
        /// </param>
        public virtual void initialize(System.String baseDir, System.String configFile)
        {
            JSONReader json = new JSONReader(configFile);

            // Resolve every configured resource below the base directory.
            System.String root = baseDir + "/";
            fileDicSystem = root + json.getValue("dic_system");
            fileDicUser = root + json.getValue("dic_user");
            fileConnections = root + json.getValue("connections");
            fileConnectionsNot = root + json.getValue("connections_not");
            fileDicAnalyzed = root + json.getValue("dic_analyzed");
            fileTagSet = root + json.getValue("tagset");

            // The tag set comes first: the connection rules and both dictionaries are read against it.
            tagSet = new TagSet();
            tagSet.init(fileTagSet, TagSet.TAG_SET_KAIST);

            // Connection rules (allowed and disallowed).
            connection = new Connection();
            connection.init(fileConnections, tagSet.TagCount, tagSet);
            connectionNot = new ConnectionNot();
            connectionNot.init(fileConnectionsNot, tagSet);

            // Dictionaries: pre-analyzed, system and user.
            analyzedDic = new AnalyzedDic();
            analyzedDic.readDic(fileDicAnalyzed);
            systemDic = new Trie(Trie.DEFAULT_TRIE_BUF_SIZE_SYS);
            systemDic.read_dic(fileDicSystem, tagSet);
            userDic = new Trie(Trie.DEFAULT_TRIE_BUF_SIZE_USER);
            userDic.read_dic(fileDicUser, tagSet);

            // Remaining collaborators of the chart.
            numDic = new NumberDic();
            simti = new Simti();
            simti.init();
            eojeolList = new LinkedList<Eojeol>();

            chart = new MorphemeChart(tagSet, connection, systemDic, userDic, numDic, simti, eojeolList);
            postProc = new PostProcessor();
        }
Example #15
0
        /// <summary>Reads the impossible connection rules from the specified file and rebuilds the rule tables.</summary>
        /// <remarks>
        /// Lines are tab-separated and dispatched on their first field:
        /// <c>@title</c>, <c>@version</c>, <c>@copyright</c>, <c>@author</c>, <c>@date</c> and <c>@editor</c>
        /// set the corresponding metadata field; <c>CONNECTION_NOT</c> is followed by a rule made of four
        /// space-separated tokens, <c>morpheme1 tag1 morpheme2 tag2</c>. For rule <c>i</c> the morphemes go
        /// to <c>notMorphTable[i]</c> and the tag ids (via <c>tagSet.getTagID</c>) to <c>notTagTable[i]</c>.
        /// Other lines are ignored.
        /// </remarks>
        /// <param name="filePath">- the file for the impossible connection rules (decoded as UTF-8)
        /// </param>
        /// <param name="tagSet">- the morpheme tag set used in the rules
        /// </param>
        /// <exception cref="System.IO.IOException">the rule file cannot be opened or read</exception>
        private void readFile(System.String filePath, TagSet tagSet)
        {
            List<String> ruleList = new List<String>();

            // 'using' closes the file even when a malformed line makes parsing throw.
            using (System.IO.StreamReader br = new System.IO.StreamReader(
                new System.IO.FileStream(filePath, System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.UTF8))
            {
                title = "";
                version = "";
                copyright = "";
                author = "";
                date = "";
                editor = "";
                startTag = "";
                ruleCount = 0;

                System.String line;
                while ((line = br.ReadLine()) != null)
                {
                    StringTokenizer lineTokenizer = new StringTokenizer(line, "\t");
                    if (!lineTokenizer.HasMoreTokens)
                    {
                        continue;
                    }

                    System.String lineToken = lineTokenizer.NextToken;

                    if (lineToken.StartsWith("@"))
                    {
                        switch (lineToken)
                        {
                            case "@title":
                                title = lineTokenizer.NextToken;
                                break;
                            case "@version":
                                version = lineTokenizer.NextToken;
                                break;
                            case "@copyright":
                                copyright = lineTokenizer.NextToken;
                                break;
                            case "@author":
                                author = lineTokenizer.NextToken;
                                break;
                            case "@date":
                                date = lineTokenizer.NextToken;
                                break;
                            case "@editor":
                                editor = lineTokenizer.NextToken;
                                break;
                        }
                    }
                    else if ("CONNECTION_NOT".Equals(lineToken))
                    {
                        // The rule text is parsed once the number of rules is known.
                        ruleList.Add(lineTokenizer.NextToken);
                    }
                }
            }

            ruleCount = ruleList.Count;

            // One {first, second} pair per rule in each table.
            notTagTable = new int[ruleCount][];
            notMorphTable = new System.String[ruleCount][];
            for (int i = 0; i < ruleCount; i++)
            {
                notTagTable[i] = new int[2];
                notMorphTable[i] = new System.String[2];
            }

            for (int i = 0; i < ruleCount; i++)
            {
                // Tokens are consumed strictly in order: morpheme, tag, morpheme, tag.
                StringTokenizer st = new StringTokenizer(ruleList[i], " ");
                notMorphTable[i][0] = st.NextToken;
                notTagTable[i][0] = tagSet.getTagID(st.NextToken);
                notMorphTable[i][1] = st.NextToken;
                notTagTable[i][1] = tagSet.getTagID(st.NextToken);
            }
        }
Example #16
0
 /// <summary> Initializes the object with the specified file for impossible connection rules.</summary>
 /// <remarks> All work is delegated to <c>readFile(filePath, tagSet)</c>; this method adds no logic of its own.</remarks>
 /// <param name="filePath">- the file for the impossible connection rules
 /// </param>
 /// <param name="tagSet">- the morpheme tag set used in the rules
 /// </param>
 /// <exception cref="System.IO.IOException">Presumably raised by <c>readFile</c> when the file cannot be read (carried over from the original documentation).</exception>
 public virtual void  init(System.String filePath, TagSet tagSet)
 {
     // Exceptions from readFile are not caught here and propagate to the caller.
     readFile(filePath, tagSet);
 }
Example #17
0
 /// <summary> Initializes the connection rules from the rule data file.</summary>
 /// <remarks> All work is delegated to <c>readFile(filePath, tagCount, tagSet)</c>; this method adds no logic of its own.</remarks>
 /// <param name="filePath">- the path for the connection rule data file
 /// </param>
 /// <param name="tagCount">- the number of the total tags
 /// </param>
 /// <param name="tagSet">- the tag set which is used in the connection rules
 /// </param>
 /// <exception cref="System.IO.IOException">Presumably raised by <c>readFile</c> when the file cannot be read (carried over from the original documentation).</exception>
 public virtual void init(System.String filePath, int tagCount, TagSet tagSet)
 {
     // Exceptions from readFile are not caught here and propagate to the caller.
     readFile(filePath, tagCount, tagSet);
 }
Example #18
0
        /// <summary>
        /// Loads the impossible connection rules from the given file and fills
        /// <c>notMorphTable</c> and <c>notTagTable</c> with one row per <c>CONNECTION_NOT</c> entry.
        /// </summary>
        /// <remarks>
        /// The file is read as UTF-8, one tab-separated record per line. A first token of
        /// <c>@title</c>, <c>@version</c>, <c>@copyright</c>, <c>@author</c>, <c>@date</c> or
        /// <c>@editor</c> assigns the next token to the matching field. A first token of
        /// <c>CONNECTION_NOT</c> registers the next token as a rule, which is later split on
        /// spaces into morpheme, tag, morpheme, tag. Lines with any other first token are skipped.
        /// </remarks>
        /// <param name="filePath">- the file for the impossible connection rules
        /// </param>
        /// <param name="tagSet">- the morpheme tag set used to resolve tag names to tag IDs
        /// </param>
        /// <exception cref="System.IO.IOException">If the file cannot be opened or read.</exception>
        private void  readFile(System.String filePath, TagSet tagSet)
        {
            List<String> ruleList = new List<String>();

            // Dispose the reader deterministically: the original only closed it on the
            // success path, so a parsing exception left the file handle open.
            using (System.IO.StreamReader br = new System.IO.StreamReader(
                new System.IO.FileStream(filePath, System.IO.FileMode.Open, System.IO.FileAccess.Read), System.Text.Encoding.UTF8))
            {
                // Header fields are cleared only once the file is known to be openable.
                title     = "";
                version   = "";
                copyright = "";
                author    = "";
                date      = "";
                editor    = "";
                startTag  = "";
                ruleCount = 0;

                System.String line;
                while ((line = br.ReadLine()) != null)
                {
                    StringTokenizer lineTokenizer = new StringTokenizer(line, "\t");
                    if (!lineTokenizer.HasMoreTokens)
                    {
                        continue;
                    }

                    // Ordinal, exact match on the record keyword; anything else is ignored.
                    switch (lineTokenizer.NextToken)
                    {
                        case "@title":
                            title = lineTokenizer.NextToken;
                            break;
                        case "@version":
                            version = lineTokenizer.NextToken;
                            break;
                        case "@copyright":
                            copyright = lineTokenizer.NextToken;
                            break;
                        case "@author":
                            author = lineTokenizer.NextToken;
                            break;
                        case "@date":
                            date = lineTokenizer.NextToken;
                            break;
                        case "@editor":
                            editor = lineTokenizer.NextToken;
                            break;
                        case "CONNECTION_NOT":
                            ruleList.Add(lineTokenizer.NextToken);
                            break;
                        default:
                            break;
                    }
                }
            }

            ruleCount = ruleList.Count;

            // Allocate ruleCount x 2 jagged tables before any rule is parsed.
            notTagTable   = new int[ruleCount][];
            notMorphTable = new System.String[ruleCount][];
            for (int row = 0; row < ruleCount; row++)
            {
                notTagTable[row]   = new int[2];
                notMorphTable[row] = new System.String[2];
            }

            for (int i = 0; i < ruleCount; i++)
            {
                // Each rule reads "<morpheme1> <tag1> <morpheme2> <tag2>".
                StringTokenizer st = new StringTokenizer(ruleList[i], " ");
                notMorphTable[i][0] = st.NextToken;
                notTagTable[i][0]   = tagSet.getTagID(st.NextToken);
                notMorphTable[i][1] = st.NextToken;
                notTagTable[i][1]   = tagSet.getTagID(st.NextToken);
            }
        }
Example #19
0
        /// <summary> Checks whether two morpheme tags can appear consecutively.</summary>
        /// <remarks>
        /// A special case is applied first: when the first tag name starts with "nc" or 'f' and
        /// the second tag name starts with 'n', the connection is rejected if the second tag name
        /// starts with "nq", or if <paramref name="len1"/> is below 4 or <paramref name="len2"/>
        /// is below 2. Otherwise the result is the <c>connectionTable</c> entry for the tag pair
        /// combined with <c>tagSet.checkTagType(typeOfTag2, tag2)</c>.
        /// Tag names are compared ordinally because they are identifiers, not linguistic text.
        /// </remarks>
        /// <param name="tagSet">- morpheme tag set
        /// </param>
        /// <param name="tag1">- the first morpheme tag to check
        /// </param>
        /// <param name="tag2">- the second morpheme tag to check
        /// </param>
        /// <param name="len1">- the length of the first morpheme
        /// </param>
        /// <param name="len2">- the length of the second morpheme
        /// </param>
        /// <param name="typeOfTag2">- the tag type of the second morpheme tag
        /// </param>
        /// <returns> true: the two consecutive morpheme tags can appear, false: they cannot appear
        /// </returns>
        public virtual bool checkConnection(TagSet tagSet, int tag1, int tag2, int len1, int len2, int typeOfTag2)
        {
            System.String tag1Name = tagSet.getTagName(tag1);
            System.String tag2Name = tagSet.getTagName(tag2);

            // Ordinal comparison: the default StartsWith(string) overload is culture-sensitive,
            // which is wrong for fixed ASCII tag identifiers.
            bool firstIsNcOrF = tag1Name.StartsWith("nc", System.StringComparison.Ordinal) || tag1Name[0] == 'f';

            if (firstIsNcOrF && tag2Name[0] == 'n')
            {
                if (tag2Name.StartsWith("nq", System.StringComparison.Ordinal))
                {
                    return false;
                }

                if (len1 < 4 || len2 < 2)
                {
                    return false;
                }
            }

            return connectionTable[tag1][tag2] && tagSet.checkTagType(typeOfTag2, tag2);
        }
Example #20
0
 /// <summary> Creates an instance bound to the given morpheme chart and tag set.</summary>
 /// <param name="mc">- the lattice style morpheme chart
 /// </param>
 /// <param name="tagSet">- morpheme tag set
 /// </param>
 public Exp(MorphemeChart mc, TagSet tagSet)
 {
     // Record the length of pset once, at construction time.
     pset_end = pset.Length;

     // Keep the collaborators handed in by the caller.
     this.tagSet = tagSet;
     this.mc = mc;
 }