/// <summary>
/// Builds the ID-based Chinese/Uyghur/Kazakh dictionary from the zuk table.
/// Every row contributes one Chinese word (<c>row.Zh</c>) plus the comma-separated
/// Uyghur (<c>row.Ug</c>) and Kazakh (<c>row.Kz</c>) words listed for it. Each distinct
/// word receives a sequential integer ID per language (first seen = 0), and the
/// forward (CtoU / CtoK) and reverse (UtoC / KtoC) ID maps of the result are filled.
/// </summary>
/// <param name="filterUWords">Uyghur words to leave out; entries are trimmed before comparison.</param>
/// <param name="filterKWords">Kazakh words to leave out; entries are trimmed before comparison.</param>
/// <returns>The populated <see cref="DictBase"/>; in CWords/UWords/KWords the word with ID n is at index n.</returns>
/// <exception cref="InvalidOperationException">The same Chinese word occurs in more than one row.</exception>
private static DictBase GenerateDatabaseFull(List<string> filterUWords, List<string> filterKWords)
{
    char[] spliter = new char[] { ',' };
    DictBase dictBase = new DictBase();

    // Membership-only lookups, so a set is enough.
    HashSet<string> excludedUWords = new HashSet<string>(filterUWords.Select(t => t.Trim()));
    HashSet<string> excludedKWords = new HashSet<string>(filterKWords.Select(t => t.Trim()));

    // word -> sequential ID, one map per language.
    Dictionary<string, int> cWordDict = new Dictionary<string, int>();
    Dictionary<string, int> uWordDict = new Dictionary<string, int>();
    Dictionary<string, int> kWordDict = new Dictionary<string, int>();

    // Reuse the cached table when there is one; otherwise load it once from
    // zukTable.xml next to the executable and cache it for later calls.
    zuk_dbSQLDataSet.zuk_fixedDataTable zukTable = _ZukTable;
    if (zukTable == null)
    {
        zukTable = new zuk_dbSQLDataSet.zuk_fixedDataTable();
        try
        {
            zukTable.ReadXml(Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "zukTable.xml"));
        }
        catch (Exception ex)
        {
            // Show the failure to the user, then let it propagate unchanged.
            MessageBox.Show(ex.ToString());
            throw;
        }
        _ZukTable = zukTable;
    }

    foreach (zuk_dbSQLDataSet.zuk_fixedRow row in zukTable)
    {
        string strChinese = row.Zh.Trim();
        string[] strUyghurs = SplitAndFilterWords(row.Ug, spliter, excludedUWords);
        string[] strKazaks = SplitAndFilterWords(row.Kz, spliter, excludedKWords);

        // A Chinese word is expected to own exactly one row.
        if (cWordDict.ContainsKey(strChinese))
        {
            throw new InvalidOperationException("Multiple Chinese Word!");
        }
        int cID = cWordDict.Count;
        cWordDict.Add(strChinese, cID);

        int[] strUyghursIDs = new int[strUyghurs.Length];
        for (int i = 0; i < strUyghurs.Length; i++)
        {
            strUyghursIDs[i] = GetOrAssignWordId(uWordDict, strUyghurs[i]);
        }

        int[] strKazaksIDs = new int[strKazaks.Length];
        for (int i = 0; i < strKazaks.Length; i++)
        {
            strKazaksIDs[i] = GetOrAssignWordId(kWordDict, strKazaks[i]);
        }

        // Forward maps: Chinese ID -> translation IDs.
        dictBase.CUDictbase.CtoU.Add(cID, strUyghursIDs);
        dictBase.CKDictbase.CtoK.Add(cID, strKazaksIDs);

        // Reverse map: Uyghur ID -> Chinese IDs (append cID if not already present).
        foreach (int uID in strUyghursIDs)
        {
            if (!dictBase.CUDictbase.UtoC.ContainsKey(uID))
            {
                dictBase.CUDictbase.UtoC.Add(uID, new int[1] { cID });
                continue;
            }

            int[] chineseIDs = dictBase.CUDictbase.UtoC[uID];
            if (!chineseIDs.Contains(cID))
            {
                Array.Resize(ref chineseIDs, chineseIDs.Length + 1);
                chineseIDs[chineseIDs.Length - 1] = cID;
                dictBase.CUDictbase.UtoC[uID] = chineseIDs;
            }
        }

        // Reverse map: Kazakh ID -> Chinese IDs (append cID if not already present).
        foreach (int kID in strKazaksIDs)
        {
            if (!dictBase.CKDictbase.KtoC.ContainsKey(kID))
            {
                dictBase.CKDictbase.KtoC.Add(kID, new int[1] { cID });
                continue;
            }

            int[] chineseIDs = dictBase.CKDictbase.KtoC[kID];
            if (!chineseIDs.Contains(cID))
            {
                Array.Resize(ref chineseIDs, chineseIDs.Length + 1);
                chineseIDs[chineseIDs.Length - 1] = cID;
                dictBase.CKDictbase.KtoC[kID] = chineseIDs;
            }
        }
    }

    // Place every word at the index of its ID explicitly rather than relying on
    // the enumeration order of Dictionary.Keys, which is not a documented guarantee.
    dictBase.CWords = WordsIndexedById(cWordDict);
    dictBase.KWords = WordsIndexedById(kWordDict);
    dictBase.UWords = WordsIndexedById(uWordDict);

    return dictBase;
}

/// <summary>Splits a separated word list, trims each entry, drops duplicates and excluded words.</summary>
private static string[] SplitAndFilterWords(string rawList, char[] separators, HashSet<string> excludedWords)
{
    // NOTE(review): an entry made only of whitespace survives RemoveEmptyEntries and
    // becomes the empty word after Trim() - confirm whether it should be dropped.
    return rawList.Split(separators, StringSplitOptions.RemoveEmptyEntries)
                  .Select(t => t.Trim())
                  .Distinct()
                  .Where(t => !excludedWords.Contains(t))
                  .ToArray();
}

/// <summary>Returns the ID already assigned to the word, or assigns the next sequential ID.</summary>
private static int GetOrAssignWordId(Dictionary<string, int> wordIds, string word)
{
    int id;
    if (!wordIds.TryGetValue(word, out id))
    {
        id = wordIds.Count;
        wordIds.Add(word, id);
    }
    return id;
}

/// <summary>Turns a word -> ID map (IDs 0..Count-1) into an array in which index == ID.</summary>
private static string[] WordsIndexedById(Dictionary<string, int> wordIds)
{
    string[] words = new string[wordIds.Count];
    foreach (KeyValuePair<string, int> entry in wordIds)
    {
        words[entry.Value] = entry.Key;
    }
    return words;
}
/// <summary>
/// Loads a zuk table from XML and turns it into an undirected word graph.
/// Each row's Chinese word becomes a vertex "c{id}", each distinct Uyghur word a vertex
/// "u{id}" and each distinct Kazakh word a vertex "k{id}" (IDs are sequential per
/// language, first seen = 0). An edge joins a Chinese vertex to every Uyghur and Kazakh
/// word listed in its row. Afterwards every Uyghur/Kazakh vertex with at most one
/// adjacent edge is removed from the graph.
/// </summary>
/// <param name="dbFileName">
/// XML file to read. A value containing a backslash is used as given; any other value
/// is resolved against the application's base directory.
/// </param>
/// <returns>The pruned graph.</returns>
/// <exception cref="ArgumentNullException"><paramref name="dbFileName"/> is null.</exception>
/// <exception cref="ArgumentException">The same Chinese word occurs in more than one row.</exception>
public static QuickGraph.UndirectedGraph<string, Edge<string>> DatabaseToGraph(string dbFileName)
{
    if (dbFileName == null)
    {
        throw new ArgumentNullException("dbFileName");
    }

    char[] spliter = new char[] { ',' };

    // word -> sequential ID, one map per language.
    Dictionary<string, int> cWordDict = new Dictionary<string, int>();
    Dictionary<string, int> uWordDict = new Dictionary<string, int>();
    Dictionary<string, int> kWordDict = new Dictionary<string, int>();

    // Constructed with parallel edges disallowed.
    QuickGraph.UndirectedGraph<string, Edge<string>> graph = new UndirectedGraph<string, Edge<string>>(false);

    string tablePath = dbFileName.IndexOf('\\') > -1
        ? dbFileName
        : Path.Combine(AppDomain.CurrentDomain.BaseDirectory, dbFileName);

    zuk_dbSQLDataSet.zuk_fixedDataTable zukTable = new zuk_dbSQLDataSet.zuk_fixedDataTable();
    zukTable.ReadXml(tablePath);

    foreach (zuk_dbSQLDataSet.zuk_fixedRow row in zukTable)
    {
        string strChinese = row.Zh.Trim();
        string[] strUyghurs = row.Ug.Split(spliter, StringSplitOptions.RemoveEmptyEntries);
        string[] strKazaks = row.Kz.Split(spliter, StringSplitOptions.RemoveEmptyEntries);

        for (int i = 0; i < strUyghurs.Length; i++)
        {
            strUyghurs[i] = strUyghurs[i].Trim();
        }
        for (int i = 0; i < strKazaks.Length; i++)
        {
            strKazaks[i] = strKazaks[i].Trim();
        }

        // Dictionary.Add throws ArgumentException if this Chinese word was already seen.
        int cID = cWordDict.Count;
        cWordDict.Add(strChinese, cID);
        string cVertex = "c" + cID;
        graph.AddVertex(cVertex);

        // Uyghur translations: create the vertex on first sight, then link it.
        foreach (string uWord in strUyghurs)
        {
            int uID;
            if (!uWordDict.TryGetValue(uWord, out uID))
            {
                uID = uWordDict.Count;
                uWordDict.Add(uWord, uID);
                graph.AddVertex("u" + uID);
            }
            graph.AddEdge(new Edge<string>(cVertex, "u" + uID));
        }

        // Kazakh translations: same scheme, exactly one AddEdge call per pair.
        foreach (string kWord in strKazaks)
        {
            int kID;
            if (!kWordDict.TryGetValue(kWord, out kID))
            {
                kID = kWordDict.Count;
                kWordDict.Add(kWord, kID);
                graph.AddVertex("k" + kID);
            }
            graph.AddEdge(new Edge<string>(cVertex, "k" + kID));
        }
    }

    // Snapshot the Uyghur/Kazakh vertices with at most one adjacent edge before
    // touching the graph, so nothing is removed while the vertex set is enumerated.
    List<string> weaklyLinked = graph.Vertices
        .Where(v => v.StartsWith("u", StringComparison.Ordinal) || v.StartsWith("k", StringComparison.Ordinal))
        .Where(v => graph.AdjacentEdges(v).Count() <= 1)
        .ToList();

    foreach (string vertex in weaklyLinked)
    {
        graph.RemoveVertex(vertex);
    }

    return graph;
}