Ejemplo n.º 1
0
        /// <summary>
        /// Builds a <c>DictBase</c> from the zuk table: every Chinese, Uyghur and Kazakh word gets an
        /// integer ID, and the Chinese-to-Uyghur / Chinese-to-Kazakh links are stored in both directions.
        /// </summary>
        /// <remarks>
        /// The table comes from the cached <c>_ZukTable</c> when it is set; otherwise it is read from
        /// "zukTable.xml" in the application base directory and cached for later calls.
        /// IDs are assigned per language in first-seen order starting at 0, and the resulting
        /// <c>CWords</c>, <c>UWords</c> and <c>KWords</c> arrays are indexed by those IDs.
        /// </remarks>
        /// <param name="filterUWords">Uyghur words to drop from every row (compared after trimming). A null list filters nothing.</param>
        /// <param name="filterKWords">Kazakh words to drop from every row (compared after trimming). A null list filters nothing.</param>
        /// <returns>The populated dictionary base.</returns>
        /// <exception cref="InvalidOperationException">The same (trimmed) Chinese word appears in more than one row.</exception>
        private static DictBase GenerateDatabaseFull(List<string> filterUWords, List<string> filterKWords)
        {
            char[] separators = new char[] { ',' };
            DictBase dictBase = new DictBase();

            HashSet<string> excludedUWords = BuildWordFilter(filterUWords);
            HashSet<string> excludedKWords = BuildWordFilter(filterKWords);

            Dictionary<string, int> cWordDict = new Dictionary<string, int>();
            Dictionary<string, int> uWordDict = new Dictionary<string, int>();
            Dictionary<string, int> kWordDict = new Dictionary<string, int>();

            // Reuse the cached table when there is one; otherwise load it once and cache it.
            zuk_dbSQLDataSet.zuk_fixedDataTable zukTable = _ZukTable;
            if (zukTable == null)
            {
                zukTable = new zuk_dbSQLDataSet.zuk_fixedDataTable();
                try
                {
                    zukTable.ReadXml(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "zukTable.xml"));
                }
                catch (Exception ex)
                {
                    MessageBox.Show(ex.ToString());
                    throw;
                }
                _ZukTable = zukTable;
            }

            foreach (zuk_dbSQLDataSet.zuk_fixedRow row in zukTable)
            {
                string strChinese = row.Zh.Trim();
                string[] strUyghurs = SplitAndFilterWords(row.Ug, separators, excludedUWords);
                string[] strKazaks = SplitAndFilterWords(row.Kz, separators, excludedKWords);

                // Each row must introduce a new Chinese headword; its ID is its insertion index.
                if (cWordDict.ContainsKey(strChinese))
                {
                    throw new InvalidOperationException("Multiple Chinese Word!");
                }
                int cID = cWordDict.Count;
                cWordDict.Add(strChinese, cID);

                int[] strUyghursIDs = GetOrAddWordIds(strUyghurs, uWordDict);
                int[] strKazaksIDs = GetOrAddWordIds(strKazaks, kWordDict);

                // Forward links: Chinese -> Uyghur / Kazakh.
                dictBase.CUDictbase.CtoU.Add(cID, strUyghursIDs);
                dictBase.CKDictbase.CtoK.Add(cID, strKazaksIDs);

                // Reverse links: Uyghur -> Chinese.
                foreach (int uID in strUyghursIDs)
                {
                    if (!dictBase.CUDictbase.UtoC.ContainsKey(uID))
                    {
                        dictBase.CUDictbase.UtoC.Add(uID, new int[] { cID });
                        continue;
                    }

                    int[] chineseIDs = dictBase.CUDictbase.UtoC[uID];
                    if (!chineseIDs.Contains(cID))
                    {
                        Array.Resize(ref chineseIDs, chineseIDs.Length + 1);
                        chineseIDs[chineseIDs.Length - 1] = cID;
                        dictBase.CUDictbase.UtoC[uID] = chineseIDs;
                    }
                }

                // Reverse links: Kazakh -> Chinese.
                foreach (int kID in strKazaksIDs)
                {
                    if (!dictBase.CKDictbase.KtoC.ContainsKey(kID))
                    {
                        dictBase.CKDictbase.KtoC.Add(kID, new int[] { cID });
                        continue;
                    }

                    int[] chineseIDs = dictBase.CKDictbase.KtoC[kID];
                    if (!chineseIDs.Contains(cID))
                    {
                        Array.Resize(ref chineseIDs, chineseIDs.Length + 1);
                        chineseIDs[chineseIDs.Length - 1] = cID;
                        dictBase.CKDictbase.KtoC[kID] = chineseIDs;
                    }
                }
            }

            // Place every word at the index equal to its ID instead of relying on the
            // (unspecified) enumeration order of Dictionary.Keys.
            dictBase.CWords = ToIdOrderedArray(cWordDict);
            dictBase.KWords = ToIdOrderedArray(kWordDict);
            dictBase.UWords = ToIdOrderedArray(uWordDict);

            return dictBase;
        }

        /// <summary>Turns a filter list into a set of trimmed words; a null list yields an empty set.</summary>
        private static HashSet<string> BuildWordFilter(List<string> words)
        {
            if (words == null)
            {
                return new HashSet<string>();
            }
            return new HashSet<string>(words.Select(t => t.Trim()));
        }

        /// <summary>Splits a raw cell into trimmed, distinct words and drops the excluded ones.</summary>
        private static string[] SplitAndFilterWords(string raw, char[] separators, HashSet<string> excluded)
        {
            return raw.Split(separators, StringSplitOptions.RemoveEmptyEntries)
                      .Select(t => t.Trim())
                      .Distinct()
                      .Where(t => !excluded.Contains(t))
                      .ToArray();
        }

        /// <summary>Returns the ID of each word, registering unseen words with the next free ID.</summary>
        private static int[] GetOrAddWordIds(string[] words, Dictionary<string, int> wordIds)
        {
            int[] ids = new int[words.Length];
            for (int i = 0; i < words.Length; i++)
            {
                int id;
                if (!wordIds.TryGetValue(words[i], out id))
                {
                    id = wordIds.Count;
                    wordIds.Add(words[i], id);
                }
                ids[i] = id;
            }
            return ids;
        }

        /// <summary>Builds an array in which each word sits at the index given by its ID (IDs are 0..Count-1).</summary>
        private static string[] ToIdOrderedArray(Dictionary<string, int> wordIds)
        {
            string[] words = new string[wordIds.Count];
            foreach (KeyValuePair<string, int> pair in wordIds)
            {
                words[pair.Value] = pair.Key;
            }
            return words;
        }
Ejemplo n.º 2
0
        /// <summary>
        /// Reads a zuk table from an XML file and builds an undirected word graph from it.
        /// </summary>
        /// <remarks>
        /// Every row contributes one Chinese vertex ("c" + id) that is connected to one vertex per
        /// comma-separated Uyghur word ("u" + id) and Kazakh word ("k" + id); a word that occurs in
        /// several rows shares a single vertex. Once the graph is built, Uyghur and Kazakh vertices
        /// with at most one adjacent edge are removed.
        /// </remarks>
        /// <param name="dbFileName">
        /// XML file to read. A value containing a backslash is passed to <c>ReadXml</c> unchanged;
        /// any other value is combined with the application base directory.
        /// </param>
        /// <returns>The pruned graph.</returns>
        /// <exception cref="ArgumentNullException"><paramref name="dbFileName"/> is null.</exception>
        /// <exception cref="ArgumentException">The same (trimmed) Chinese word appears in more than one row.</exception>
        public static QuickGraph.UndirectedGraph<string, Edge<string>> DatabaseToGraph(string dbFileName)
        {
            if (dbFileName == null)
            {
                throw new ArgumentNullException("dbFileName");
            }

            char[] separators = new char[] { ',' };
            Dictionary<string, int> cWordDict = new Dictionary<string, int>();
            Dictionary<string, int> uWordDict = new Dictionary<string, int>();
            Dictionary<string, int> kWordDict = new Dictionary<string, int>();

            // Parallel edges are disallowed (constructor argument is false).
            QuickGraph.UndirectedGraph<string, Edge<string>> graph = new UndirectedGraph<string, Edge<string>>(false);

            zuk_dbSQLDataSet.zuk_fixedDataTable zukTable = new zuk_dbSQLDataSet.zuk_fixedDataTable();
            string tablePath = dbFileName.IndexOf('\\') > -1
                ? dbFileName
                : Path.Combine(AppDomain.CurrentDomain.BaseDirectory, dbFileName);
            zukTable.ReadXml(tablePath);

            foreach (zuk_dbSQLDataSet.zuk_fixedRow row in zukTable)
            {
                string strChinese = row.Zh.Trim();
                string[] strUyghurs = row.Ug.Split(separators, StringSplitOptions.RemoveEmptyEntries)
                                            .Select(t => t.Trim())
                                            .ToArray();
                string[] strKazaks = row.Kz.Split(separators, StringSplitOptions.RemoveEmptyEntries)
                                           .Select(t => t.Trim())
                                           .ToArray();

                // Dictionary.Add rejects a repeated Chinese headword with an ArgumentException.
                int cID = cWordDict.Count;
                cWordDict.Add(strChinese, cID);
                string chineseVertex = "c" + cID;
                graph.AddVertex(chineseVertex);

                foreach (string word in strUyghurs)
                {
                    int uID;
                    if (!uWordDict.TryGetValue(word, out uID))
                    {
                        uID = uWordDict.Count;
                        uWordDict.Add(word, uID);
                        graph.AddVertex("u" + uID);
                    }
                    graph.AddEdge(new Edge<string>(chineseVertex, "u" + uID));
                }

                foreach (string word in strKazaks)
                {
                    int kID;
                    if (!kWordDict.TryGetValue(word, out kID))
                    {
                        kID = kWordDict.Count;
                        kWordDict.Add(word, kID);
                        graph.AddVertex("k" + kID);
                    }
                    // Exactly one edge per (Chinese, Kazakh) pair, mirroring the Uyghur loop.
                    graph.AddEdge(new Edge<string>(chineseVertex, "k" + kID));
                }
            }

            // Snapshot the Uyghur/Kazakh vertices with at most one adjacent edge before removing
            // anything, so all degrees are measured on the unpruned graph.
            List<string> sparseVertices = graph.Vertices
                .Where(v => v.StartsWith("u", StringComparison.Ordinal) || v.StartsWith("k", StringComparison.Ordinal))
                .Where(v => graph.AdjacentEdges(v).Count() <= 1)
                .ToList();

            foreach (string vertex in sparseVertices)
            {
                graph.RemoveVertex(vertex);
            }

            // NOTE(review): nothing in this method reads the algorithm's result. The call is kept
            // only in case Compute() attaches to the graph in a way callers rely on - confirm, then remove.
            IncrementalConnectedComponentsAlgorithm<string, Edge<string>> componentsAlgorithm =
                new IncrementalConnectedComponentsAlgorithm<string, Edge<string>>(graph as IMutableVertexAndEdgeSet<string, Edge<string>>);
            componentsAlgorithm.Compute();

            return graph;
        }