// Counts attachment errors between a predicted and a gold parse string,
// leaving out tokens whose POS tag consists only of punctuation characters.
//   inst - instance supplying the POS tags
//   pred - predicted parse, space-separated items of the form "edge:label"
//   act  - gold parse in the same format
// Returns (number of gold items) - (skipped punctuation tokens) - (matching edges).
public double NumErrorsDepNoPunc(DependencyInstance inst, string pred, string act)
{
    string[] goldItems = act.Split(' ');
    string[] guessItems = pred.Split(' ');
    string[] tags = inst.POS;

    int matched = 0;
    int skipped = 0;
    int position = 0;
    while (position < guessItems.Length)
    {
        // Only the part before ':' is compared; the label after it is ignored here.
        string guessEdge = guessItems[position].Split(':')[0];
        string goldEdge = goldItems[position].Split(':')[0];

        // Tags are read one position ahead of the parse items
        // (presumably tags[0] belongs to an artificial root token - confirm).
        bool isPunctuation = tags[position + 1].Matches(@"[,:\.'`]+");
        position++;

        if (isPunctuation)
        {
            skipped++;
        }
        else if (guessEdge.Equals(goldEdge))
        {
            matched++;
        }
    }

    double total = goldItems.Length;
    return total - skipped - matched;
}
// Builds an empty K-best chart for the span [start, end] of the given instance.
//   start, end - first and last positions covered by the forest
//   inst       - instance whose sentence and POS tags are remembered
//   K          - number of entries kept per chart cell
public KBestParseForest(int start, int end, DependencyInstance inst, int K)
{
    this.K = K;

    // The chart is square in the two position dimensions, has two slots in each
    // of the next two dimensions, and K entries per cell.
    int positions = end + 1;
    Chart = new ParseForestItem[positions, positions, 2, 2, K];

    m_start = start;
    m_end = end;
    m_sent = inst.Sentence;
    m_pos = inst.POS;
}
// Counts attachment errors between a predicted and a gold parse string.
//   inst - not used by this method
//   pred - predicted parse, space-separated items of the form "edge:label"
//   act  - gold parse in the same format
// Returns (number of gold items) - (items whose part before ':' matches).
public double NumErrorsDep(DependencyInstance inst, string pred, string act)
{
    string[] goldItems = act.Split(' ');
    string[] guessItems = pred.Split(' ');

    int matched = 0;
    int position = 0;
    foreach (string guessItem in guessItems)
    {
        // Compare only the text before ':'; labels are ignored here.
        string guessEdge = guessItem.Split(':')[0];
        string goldEdge = goldItems[position].Split(':')[0];
        position++;

        if (guessEdge.Equals(goldEdge))
        {
            matched++;
        }
    }

    double total = goldItems.Length;
    return total - matched;
}
// Second-order non-projective decoding: starts from the 1-best projective parse
// and lets Rearrange modify the head/label arrays in place.
// Note that K is not used: the projective decoder is always asked for exactly one
// parse, so the returned table has a single meaningful row.
//   returns - the table produced by DecodeProjective with row 0 overwritten:
//             [0,0] = feature vector of the final tree, [0,1] = "head|index:label ..." string.
public object[,] DecodeNonProjective(DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,] fvsTrips, double[,,] probsTrips, FeatureVector[,,] fvsSibs, double[,,] probsSibs, FeatureVector[,,,] ntFvs, double[,,,] ntProbs, int K)
{
    string[] toks = inst.Sentence;
    string[] pos = inst.POS;

    object[,] orig = DecodeProjective(inst, fvs, probs, fvsTrips, probsTrips, fvsSibs, probsSibs, ntFvs, ntProbs, 1);

    // Each item of the parse string looks like "head|index:label".
    string[] items = ((string) orig[0, 1]).Split(' ');
    var par = new int[items.Length + 1];
    var labs = new int[items.Length + 1];
    labs[0] = 0;
    par[0] = -1;
    for (int i = 1; i < par.Length; i++)
    {
        string item = items[i - 1];
        // Split on the literal '|' only. The previous delimiter set was built from the
        // regex-style string "\\|" and therefore also split on a backslash.
        par[i] = int.Parse(item.Split('|')[0]);
        labs[i] = m_pipe.Labeled ? int.Parse(item.Split(':')[1]) : 0;
    }

    // Rearrange updates par/labs in place (its exact strategy is defined elsewhere).
    Rearrange(probs, probsTrips, probsSibs, ntProbs, par, labs);

    // Rebuild the parse string without repeated string concatenation.
    var spans = new string[par.Length - 1];
    for (int i = 1; i < par.Length; i++)
    {
        spans[i - 1] = par[i] + "|" + i + ":" + labs[i];
    }

    orig[0, 0] = ((DependencyPipe2O) m_pipe).CreateFeatureVector(toks, pos, labs, par);
    orig[0, 1] = string.Join(" ", spans);
    return orig;
}
// Total loss of a predicted parse against the gold parse: attachment errors plus
// label errors. When LossType is LossTypes.NoPunc the punctuation-skipping
// variants of both counters are used.
public double NumErrors(DependencyInstance inst, string pred, string act)
{
    if (LossType != LossTypes.NoPunc)
    {
        return NumErrorsDep(inst, pred, act) + NumErrorsLabel(inst, pred, act);
    }

    return NumErrorsDepNoPunc(inst, pred, act) + NumErrorsLabelNoPunc(inst, pred, act);
}
// Applies one MIRA update from the decoder output.
//   inst - training instance providing the gold parse string and gold feature vector
//   d    - decoder output; column 0 holds a FeatureVector, column 1 the parse string;
//          rows are consumed up to the first row whose column 0 is null
//   upd  - multiplier applied to the update when accumulating into Total
public void UpdateParamsMIRA(DependencyInstance inst, object[,] d, double upd)
{
    string goldTree = inst.ActParseTree;
    FeatureVector goldFv = inst.Fv;

    // Number of leading rows that actually contain a parse.
    int K = 0;
    while (K < d.GetLength(0) && d[K, 0] != null)
    {
        K++;
    }

    var margins = new double[K];
    var scoreGaps = new double[K];
    var differences = new FeatureVector[K];

    for (int k = 0; k < K; k++)
    {
        // Score gap between the gold parse and candidate k under the current weights.
        double goldScore = GetScore(goldFv);
        scoreGaps[k] = goldScore - GetScore((FeatureVector) d[k, 0]);

        // Loss of candidate k minus that score gap.
        margins[k] = NumErrors(inst, (string) d[k, 1], goldTree);
        margins[k] -= scoreGaps[k];

        differences[k] = FeatureVector.GetDistVector(goldFv, (FeatureVector) d[k, 0]);
    }

    double[] alpha = hildreth(differences, margins);

    for (int k = 0; k < K; k++)
    {
        foreach (Feature feature in differences[k].FVector)
        {
            // Negative indices are skipped.
            if (feature.Index >= 0)
            {
                parameters[feature.Index] += alpha[k]*feature.Value;
                Total[feature.Index] += upd*alpha[k]*feature.Value;
            }
        }
    }
}
// Runs one training pass over all instances, reading the pre-computed features
// for each instance from the forest file and applying a MIRA update per instance.
//   il           - training instances (their Length sizes the score tables)
//   trainfile    - not used by this method
//   train_forest - path of the binary forest file, read sequentially
//   iter         - 1-based iteration number, used to compute the update multiplier
private void TrainingIter(DependencyInstance[] il, string trainfile, string train_forest, int iter)
{
    // 'using' guarantees the forest file is closed even when an instance fails to decode.
    using (var in_ = new BinaryReader(new FileStream(train_forest, FileMode.Open)))
    {
        for (int i = 0; i < il.Length; i++)
        {
            if ((i + 1)%100 == 0)
            {
                Console.WriteLine(" " + (i + 1) + " instances");
            }

            DependencyInstance inst = il[i];
            int length = inst.Length;

            // Score and feature tables filled by the pipe for this instance.
            var fvs = new FeatureVector[length,length,2];
            var probs = new double[length,length,2];
            var ntFvs = new FeatureVector[length,m_pipe.Types.Length,2,2];
            var ntProbs = new double[length,m_pipe.Types.Length,2,2];
            var fvsTrips = new FeatureVector[length,length,length];
            var probsTrips = new double[length,length,length];
            var fvsSibs = new FeatureVector[length,length,2];
            var probsSibs = new double[length,length,2];

            if (SecondOrder)
            {
                inst = ((DependencyPipe2O) m_pipe).GetFeatureVector(in_, inst, fvs, probs, fvsTrips, probsTrips, fvsSibs, probsSibs, ntFvs, ntProbs, m_params);
            }
            else
            {
                inst = m_pipe.ReadFeatureVector(in_, inst, fvs, probs, ntFvs, ntProbs, m_params);
            }

            // Multiplier that shrinks by one with every processed instance across all iterations.
            var upd = (double) (NumIters*il.Length - (il.Length*(iter - 1) + (i + 1)) + 1);
            int K = TrainK;

            object[,] d = null;
            if (DecodeType==ProjectiveTypes.Projective)
            {
                if (SecondOrder)
                {
                    d = ((DependencyDecoder2O) m_decoder).DecodeProjective(inst, fvs, probs, fvsTrips, probsTrips, fvsSibs, probsSibs, ntFvs, ntProbs, K);
                }
                else
                {
                    d = m_decoder.DecodeProjective(inst, fvs, probs, ntFvs, ntProbs, K);
                }
            }
            if (DecodeType==ProjectiveTypes.NonProjective)
            {
                if (SecondOrder)
                {
                    d = ((DependencyDecoder2O) m_decoder).DecodeNonProjective(inst, fvs, probs, fvsTrips, probsTrips, fvsSibs, probsSibs, ntFvs, ntProbs, K);
                }
                else
                {
                    d = m_decoder.decodeNonProjective(inst, fvs, probs, ntFvs, ntProbs, K);
                }
            }

            m_params.UpdateParamsMIRA(inst, d, upd);
        }

        Console.WriteLine("");
        Console.WriteLine(" " + il.Length + " instances");
    }
}
// Trains the model for NumIters passes over the instances and then averages the
// parameters.
//   il          - training instances
//   trainfile   - forwarded to TrainingIter
//   trainForest - path of the binary forest file, forwarded to TrainingIter
public void Train(DependencyInstance[] il, string trainfile, string trainForest)
{
    Console.WriteLine("About to Train");
    Console.WriteLine("Num Feats: " + m_pipe.DataAlphabet.Count);

    int i = 0;
    for (i = 0; i < NumIters; i++)
    {
        Console.WriteLine("========================");
        Console.WriteLine("Iteration: " + i);
        Console.WriteLine("========================");
        Console.Write("Processed: ");

        // Measure elapsed wall-clock time in milliseconds. The previous code multiplied
        // DateTime ticks by 10000, which overflows long and is not a meaningful unit.
        var timer = System.Diagnostics.Stopwatch.StartNew();
        TrainingIter(il, trainfile, trainForest, i + 1);
        timer.Stop();

        Console.WriteLine("Training iter took: " + timer.ElapsedMilliseconds);
    }

    // After the loop i equals the number of completed iterations.
    m_params.AverageParams(i*il.Length);
}
// Writes every candidate feature vector of an instance to the forest file.
// Layout: each feature vector is a run of non-negative feature indices ended by -2;
// each section (edges, optional labels, triples, siblings) is ended by -3; the
// instance's own feature indices are ended by -4; then come the sentence, the POS
// tags (with no marker between the two), -6, the labels, -7, the gold parse string, and -1.
// NOTE(review): an IOException is swallowed here, so a failed write leaves a
// truncated record without any signal to the caller - confirm this is intended.
public override void WritePossibleFeatures(DependencyInstance inst, BinaryWriter out_)
{
    string[] words = inst.Sentence;
    string[] tags = inst.POS;
    int n = words.Length;

    // First character of every POS tag.
    var coarseTags = new string[tags.Length];
    for (int idx = 0; idx < tags.Length; idx++)
    {
        coarseTags[idx] = tags[idx].SubstringWithIndex(0, 1);
    }

    try
    {
        // Edge features for every ordered pair and both attachment directions.
        for (int left = 0; left < n; left++)
        {
            for (int right = left + 1; right < n; right++)
            {
                for (int dir = 0; dir < 2; dir++)
                {
                    bool attR = dir == 0;
                    FeatureVector edgeFv = CreateFeatureVector(words, tags, coarseTags, left, right, attR, new FeatureVector());
                    WriteFeatureRecord2O(out_, edgeFv);
                }
            }
        }
        out_.Write(-3);

        // Label features per word, type, direction and child flag.
        if (Labeled)
        {
            for (int word = 0; word < n; word++)
            {
                for (int t = 0; t < Types.Length; t++)
                {
                    string type = Types[t];
                    for (int dir = 0; dir < 2; dir++)
                    {
                        bool attR = dir == 0;
                        for (int ch = 0; ch < 2; ch++)
                        {
                            bool child = ch == 0;
                            FeatureVector labelFv = CreateFeatureVector(words, tags, coarseTags, word, type, attR, child, new FeatureVector());
                            WriteFeatureRecord2O(out_, labelFv);
                        }
                    }
                }
            }
            out_.Write(-3);
        }

        // Triple features: first with indices growing to the right, then to the left.
        for (int a = 0; a < n; a++)
        {
            for (int b = a; b < n; b++)
            {
                for (int c = b + 1; c < n; c++)
                {
                    FeatureVector tripFv = CreateFeatureVector(words, tags, coarseTags, a, b, c, new FeatureVector());
                    WriteFeatureRecord2O(out_, tripFv);
                }
            }
            for (int b = a; b >= 0; b--)
            {
                for (int c = b - 1; c >= 0; c--)
                {
                    FeatureVector tripFv = CreateFeatureVector(words, tags, coarseTags, a, b, c, new FeatureVector());
                    WriteFeatureRecord2O(out_, tripFv);
                }
            }
        }
        out_.Write(-3);

        // Sibling features for every pair of distinct positions and both flag values.
        for (int a = 0; a < n; a++)
        {
            for (int b = 0; b < n; b++)
            {
                if (a == b)
                {
                    continue;
                }
                for (int wh = 0; wh < 2; wh++)
                {
                    FeatureVector sibFv = CreateFeatureVectorSib(words, tags, a, b, wh == 0, new FeatureVector());
                    WriteFeatureRecord2O(out_, sibFv);
                }
            }
        }
        out_.Write(-3);

        // The instance's own features are written unfiltered.
        foreach (Feature feature in inst.Fv.FVector)
        {
            out_.Write(feature.Index);
        }
        out_.Write(-4);

        out_.Write(inst.Sentence.Length);
        foreach (string token in inst.Sentence)
        {
            out_.Write(token);
        }

        out_.Write(inst.POS.Length);
        foreach (string tag in inst.POS)
        {
            out_.Write(tag);
        }
        out_.Write(-6);

        out_.Write(inst.Labs.Length);
        foreach (string label in inst.Labs)
        {
            out_.Write(label);
        }
        out_.Write(-7);

        out_.Write(inst.ActParseTree);
        out_.Write(-1);
    }
    catch (IOException)
    {
    }
}

// Writes the non-negative feature indices of one vector followed by the -2 terminator.
private static void WriteFeatureRecord2O(BinaryWriter sink, FeatureVector vector)
{
    foreach (Feature feature in vector.FVector)
    {
        if (feature.Index >= 0)
        {
            sink.Write(feature.Index);
        }
    }
    sink.Write(-2);
}
// Computes all candidate feature vectors of an instance in memory and scores each
// one with the given parameters. Results are stored in the supplied tables:
//   fvs/probs           - edges [w1, w2, direction] for w1 < w2
//   ntFvs/ntProbs       - labels [word, type, direction, child flag] (only when Labeled)
//   fvsTrips/probsTrips - triples [w1, w2, w3]
//   fvsSibs/probsSibs   - siblings [w1, w2, flag] for w1 != w2
public void GetFeatureVector(DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,] fvsTrips, double[,,] probsTrips, FeatureVector[,,] fvsSibs, double[,,] probsSibs, FeatureVector[,,,] ntFvs, double[,,,] ntProbs, Parameters @params)
{
    string[] words = inst.Sentence;
    string[] tags = inst.POS;
    int n = words.Length;

    // First character of every POS tag.
    var coarseTags = new string[tags.Length];
    for (int idx = 0; idx < tags.Length; idx++)
    {
        coarseTags[idx] = tags[idx].SubstringWithIndex(0, 1);
    }

    // Edge features for both attachment directions.
    for (int left = 0; left < n; left++)
    {
        for (int right = left + 1; right < n; right++)
        {
            for (int dir = 0; dir < 2; dir++)
            {
                bool attR = dir == 0;
                FeatureVector edgeFv = CreateFeatureVector(words, tags, coarseTags, left, right, attR, new FeatureVector());
                double edgeScore = @params.GetScore(edgeFv);
                fvs[left, right, dir] = edgeFv;
                probs[left, right, dir] = edgeScore;
            }
        }
    }

    // Label features.
    if (Labeled)
    {
        for (int word = 0; word < n; word++)
        {
            for (int t = 0; t < Types.Length; t++)
            {
                string type = Types[t];
                for (int dir = 0; dir < 2; dir++)
                {
                    bool attR = dir == 0;
                    for (int ch = 0; ch < 2; ch++)
                    {
                        bool child = ch == 0;
                        FeatureVector labelFv = CreateFeatureVector(words, tags, coarseTags, word, type, attR, child, new FeatureVector());
                        double labelScore = @params.GetScore(labelFv);
                        ntFvs[word, t, dir, ch] = labelFv;
                        ntProbs[word, t, dir, ch] = labelScore;
                    }
                }
            }
        }
    }

    // Triple features: indices growing to the right first, then to the left.
    for (int a = 0; a < n; a++)
    {
        for (int b = a; b < n; b++)
        {
            for (int c = b + 1; c < n; c++)
            {
                FeatureVector tripFv = CreateFeatureVector(words, tags, coarseTags, a, b, c, new FeatureVector());
                double tripScore = @params.GetScore(tripFv);
                fvsTrips[a, b, c] = tripFv;
                probsTrips[a, b, c] = tripScore;
            }
        }
        for (int b = a; b >= 0; b--)
        {
            for (int c = b - 1; c >= 0; c--)
            {
                FeatureVector tripFv = CreateFeatureVector(words, tags, coarseTags, a, b, c, new FeatureVector());
                double tripScore = @params.GetScore(tripFv);
                fvsTrips[a, b, c] = tripFv;
                probsTrips[a, b, c] = tripScore;
            }
        }
    }

    // Sibling features for every pair of distinct positions.
    for (int a = 0; a < n; a++)
    {
        for (int b = 0; b < n; b++)
        {
            if (a == b)
            {
                continue;
            }
            for (int wh = 0; wh < 2; wh++)
            {
                FeatureVector sibFv = CreateFeatureVectorSib(words, tags, a, b, wh == 0, new FeatureVector());
                double sibScore = @params.GetScore(sibFv);
                fvsSibs[a, b, wh] = sibFv;
                probsSibs[a, b, wh] = sibScore;
            }
        }
    }
}
// Reads one sentence through ReadLines and wraps it in a DependencyInstance whose
// ActParseTree is the gold parse encoded as "head|index:labelIndex" items
// separated by single spaces (position 0 is not included).
public DependencyInstance CreateInstance(ref string[] toks,ref string[] pos,out string []labs, out int[] deps)
{
    ReadLines(ref toks, ref pos, out labs, out deps);

    FeatureVector goldFv = CreateFeatureVector(toks, pos, labs, deps);
    var instance = new DependencyInstance(toks, pos, labs, goldFv);

    string tree = "";
    int index = 1;
    while (index < deps.Length)
    {
        tree += deps[index] + "|" + index + ":" + TypeAlphabet.LookupIndex(labs[index]) + " ";
        index++;
    }

    // Remove the trailing separator.
    instance.ActParseTree = tree.Trim();
    return instance;
}
// First-order non-projective decoding. Builds a dense edge-score matrix, obtains
// the best tree from chuLiuEdmonds, and derives further candidates from
// getKChanges, each differing from the best tree in the head of a single word.
//   returns - object[k, 2]: [k,0] = feature vector of parse k,
//             [k,1] = parse string "head|index:label ..." (label is 0 when unlabeled)
public object[,] decodeNonProjective(DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,,] nt_fvs, double[,,,] nt_probs, int K)
{
    string[] pos = inst.POS;
    int numWords = inst.Sentence.Length;
    bool labeled = m_pipe.Labeled;

    var oldI = new int[numWords,numWords];
    var oldO = new int[numWords,numWords];
    var scoreMatrix = new double[numWords,numWords];
    var orig_scoreMatrix = new double[numWords,numWords];
    var curr_nodes = new bool[numWords];
    var reps = new Dictionary<int, int>[numWords];

    int[,] static_types = null;
    if (labeled)
    {
        static_types = GetTypes(nt_probs, pos.Length);
    }

    for (int i = 0; i < numWords; i++)
    {
        curr_nodes[i] = true;
        reps[i] = new Dictionary<int, int>();
        reps[i].Add(i, 0);
        for (int j = 0; j < numWords; j++)
        {
            // Score of edge i --> j. The tables are indexed by (smaller, larger, direction).
            int lo = i < j ? i : j;
            int hi = i < j ? j : i;
            int dir = i < j ? 0 : 1;

            double edgeScore = probs[lo, hi, dir];
            if (labeled)
            {
                int type = static_types[i, j];
                edgeScore += nt_probs[i, type, dir, 1] + nt_probs[j, type, dir, 0];
            }

            // scoreMatrix is handed to chuLiuEdmonds; the untouched copy goes to getKChanges.
            scoreMatrix[i, j] = edgeScore;
            orig_scoreMatrix[i, j] = edgeScore;
            oldI[i, j] = i;
            oldO[i, j] = j;
        }
    }

    Dictionary<int, int> final_edges = chuLiuEdmonds(scoreMatrix, curr_nodes, oldI, oldO, false, new Dictionary<int, int>(), reps);

    // final_edges maps child -> parent.
    var par = new int[numWords];
    foreach (KeyValuePair<int, int> edge in final_edges)
    {
        par[edge.Key] = edge.Value;
    }

    int[] n_par = getKChanges(par, orig_scoreMatrix, Math.Min(K, par.Length));
    int new_k = 1;
    for (int i = 0; i < n_par.Length; i++)
    {
        if (n_par[i] > -1)
        {
            new_k++;
        }
    }

    // Row 0 is the best tree; every further row copies it and changes one head.
    var fin_par = new int[new_k,numWords];
    var fin_fv = new FeatureVector[new_k,numWords];
    for (int i = 0; i < numWords; i++)
    {
        fin_par[0, i] = par[i];
    }
    int c = 1;
    for (int i = 0; i < n_par.Length; i++)
    {
        if (n_par[i] > -1)
        {
            for (int ct = 0; ct < par.Length; ct++)
            {
                fin_par[c, ct] = par[ct];
            }
            fin_par[c, i] = n_par[i];
            c++;
        }
    }

    // Per-word feature vectors of every candidate parse.
    for (int k = 0; k < new_k; k++)
    {
        for (int ch = 0; ch < numWords; ch++)
        {
            int pr = fin_par[k, ch];
            if (pr != -1)
            {
                int dir = ch < pr ? 1 : 0;
                fin_fv[k, ch] = fvs[ch < pr ? ch : pr, ch < pr ? pr : ch, dir];
                if (labeled)
                {
                    fin_fv[k, ch] = FeatureVector.Cat(fin_fv[k, ch], nt_fvs[ch, static_types[pr, ch], dir, 0]);
                    fin_fv[k, ch] = FeatureVector.Cat(fin_fv[k, ch], nt_fvs[pr, static_types[pr, ch], dir, 1]);
                }
            }
            else
            {
                fin_fv[k, ch] = new FeatureVector();
            }
        }
    }

    var fin = new FeatureVector[new_k];
    var result = new string[new_k];
    for (int k = 0; k < fin.Length; k++)
    {
        fin[k] = new FeatureVector();
        // Iterate over the word dimension (dimension 1). The previous code passed the
        // parse index k to GetLength, which truncated the vector of parse 0 to new_k
        // words and threw for k >= 2 because the array has only two dimensions.
        for (int i = 1; i < fin_fv.GetLength(1); i++)
        {
            fin[k] = FeatureVector.Cat(fin_fv[k, i], fin[k]);
        }

        result[k] = "";
        for (int i = 1; i < par.Length; i++)
        {
            result[k] += fin_par[k, i] + "|" + i + (labeled ? ":" + static_types[fin_par[k, i], i] : ":0") + " ";
        }
    }

    // Pack feature vectors and parse strings into the result table.
    var d = new object[new_k,2];
    for (int k = 0; k < new_k; k++)
    {
        d[k, 0] = fin[k];
        d[k, 1] = result[k].Trim();
    }
    return d;
}
// Second-order projective K-best decoding over a KBestParseForest2O chart.
// Spans [s, t] are processed by increasing width j; for each span the chart is
// filled from already completed sub-spans, combining edge scores (probs), triple
// scores (probsTrips), sibling scores (probsSibs) and, when the pipe is labeled,
// label scores (ntProbs).
// Returns whatever pf.GetBestParses() produces for the filled chart.
// NOTE(review): the meaning of the integer direction/completeness codes passed to
// pf.GetItems and pf.Add is defined by KBestParseForest2O, not here - confirm there.
public object[,] DecodeProjective(DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,] fvsTrips, double[,,] probsTrips, FeatureVector[,,] fvsSibs, double[,,] probsSibs, FeatureVector[,,,] ntFvs, double[,,,] ntProbs, int K) {
    string[] toks = inst.Sentence;
    string[] pos = inst.POS;
    // Label type chosen per ordered word pair; only computed for labeled parsing.
    int[,] staticTypes = null;
    if (m_pipe.Labeled) {
        staticTypes = GetTypes(ntProbs, toks.Length);
    }
    var pf = new KBestParseForest2O(0, toks.Length - 1, inst, K);
    // Seed the chart with two zero-score single-word items per position.
    for (int s = 0; s < toks.Length; s++) {
        pf.Add(s, -1, 0, 0.0, new FeatureVector());
        pf.Add(s, -1, 1, 0.0, new FeatureVector());
    }
    // j is the span width, s the left end and t the right end of the span.
    for (int j = 1; j < toks.Length; j++) {
        for (int s = 0; s < toks.Length && s + j < toks.Length; s++) {
            int t = s + j;
            // Edge features/scores between s and t for both direction slots.
            FeatureVector prodFvSt = fvs[s, t, 0];
            FeatureVector prodFvTs = fvs[s, t, 1];
            double prodProbSt = probs[s, t, 0];
            double prodProbTs = probs[s, t, 1];
            // type1 is looked up for the pair (s, t), type2 for (t, s); 0 when unlabeled.
            int type1 = m_pipe.Labeled ? staticTypes[s, t] : 0;
            int type2 = m_pipe.Labeled ? staticTypes[t, s] : 0;
            FeatureVector ntFvS01 = ntFvs[s, type1, 0, 1];
            FeatureVector ntFvS10 = ntFvs[s, type2, 1, 0];
            FeatureVector ntFvT00 = ntFvs[t, type1, 0, 0];
            FeatureVector ntFvT11 = ntFvs[t, type2, 1, 1];
            double ntProbS01 = ntProbs[s, type1, 0, 1];
            double ntProbS10 = ntProbs[s, type2, 1, 0];
            double ntProbT00 = ntProbs[t, type1, 0, 0];
            double ntProbT11 = ntProbs[t, type2, 1, 1];
            // Unused local kept from the original implementation.
            double prodProb = 0.0;
            if (true) {
                // case when R == S
                ParseForestItem[] b1 = pf.GetItems(s, s, 0, 0);
                ParseForestItem[] c1 = pf.GetItems(s + 1, t, 1, 0);
                if (!(b1 == null || c1 == null)) {
                    FeatureVector prodFvSst = pf.Cat(fvsTrips[s, s, t], fvsSibs[s, t, 0]);
                    double prodProbSst = probsTrips[s, s, t] + probsSibs[s, t, 0];
                    int[,] pairs = pf.GetKBestPairs(b1, c1);
                    for (int k = 0; k < K; k++) {
                        // A -1 entry ends the list of usable pairs.
                        if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                        int comp1 = pairs[k, 0];
                        int comp2 = pairs[k, 1];
                        double bc = b1[comp1].Prob + c1[comp2].Prob;
                        // Add the edge score for slot 0 plus the triple/sibling scores for (s, s, t).
                        bc += prodProbSt + prodProbSst;
                        FeatureVector fvFin = pf.Cat(prodFvSt, prodFvSst);
                        if (m_pipe.Labeled) {
                            bc += ntProbS01 + ntProbT00;
                            fvFin = FeatureVector.Cat(ntFvS01, FeatureVector.Cat(ntFvT00, fvFin));
                        }
                        pf.Add(s, s, t, type1, 0, 1, bc, fvFin, b1[comp1], c1[comp2]);
                    }
                }
                // case when R == T
                b1 = pf.GetItems(s, t - 1, 0, 0);
                c1 = pf.GetItems(t, t, 1, 0);
                if (!(b1 == null || c1 == null)) {
                    FeatureVector prodFvStt = pf.Cat(fvsTrips[t, t, s], fvsSibs[t, s, 0]);
                    double prodProbStt = probsTrips[t, t, s] + probsSibs[t, s, 0];
                    int[,] pairs = pf.GetKBestPairs(b1, c1);
                    for (int k = 0; k < K; k++) {
                        if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                        int comp1 = pairs[k, 0];
                        int comp2 = pairs[k, 1];
                        double bc = b1[comp1].Prob + c1[comp2].Prob;
                        // Mirror of the case above: edge score for slot 1 plus the
                        // triple/sibling scores for (t, t, s).
                        bc += prodProbTs + prodProbStt;
                        FeatureVector fvFin = pf.Cat(prodFvTs, prodFvStt);
                        if (m_pipe.Labeled) {
                            bc += ntProbT11 + ntProbS10;
                            fvFin = FeatureVector.Cat(ntFvT11, FeatureVector.Cat(ntFvS10, fvFin));
                        }
                        pf.Add(s, t, t, type2, 1, 1, bc, fvFin, b1[comp1], c1[comp2]);
                    }
                }
            }
            for (int r = s; r < t; r++) {
                // First case - create sibling
                ParseForestItem[] b1 = pf.GetItems(s, r, 0, 0);
                ParseForestItem[] c1 = pf.GetItems(r + 1, t, 1, 0);
                if (!(b1 == null || c1 == null)) {
                    int[,] pairs = pf.GetKBestPairs(b1, c1);
                    for (int k = 0; k < K; k++) {
                        if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                        int comp1 = pairs[k, 0];
                        int comp2 = pairs[k, 1];
                        double bc = b1[comp1].Prob + c1[comp2].Prob;
                        // The same combination is stored under both direction slots with code 2;
                        // no features or extra score are attached at this step.
                        pf.Add(s, r, t, -1, 0, 2, bc, new FeatureVector(), b1[comp1], c1[comp2]);
                        pf.Add(s, r, t, -1, 1, 2, bc, new FeatureVector(), b1[comp1], c1[comp2]);
                    }
                }
            }
            for (int r = s + 1; r < t; r++) {
                // S -> (R,T)
                ParseForestItem[] b1 = pf.GetItems(s, r, 0, 1);
                ParseForestItem[] c1 = pf.GetItems(r, t, 0, 2);
                if (!(b1 == null || c1 == null)) {
                    int[,] pairs = pf.GetKBestPairs(b1, c1);
                    for (int k = 0; k < K; k++) {
                        if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                        int comp1 = pairs[k, 0];
                        int comp2 = pairs[k, 1];
                        double bc = b1[comp1].Prob + c1[comp2].Prob;
                        // Edge score plus triple score for (s, r, t) and sibling score for (r, t).
                        bc += prodProbSt + probsTrips[s, r, t] + probsSibs[r, t, 1];
                        FeatureVector fv_fin = pf.Cat(prodFvSt, pf.Cat(fvsTrips[s, r, t], fvsSibs[r, t, 1]));
                        if (m_pipe.Labeled) {
                            bc += ntProbS01 + ntProbT00;
                            fv_fin = FeatureVector.Cat(ntFvS01, FeatureVector.Cat(ntFvT00, fv_fin));
                        }
                        pf.Add(s, r, t, type1, 0, 1, bc, fv_fin, b1[comp1], c1[comp2]);
                    }
                }
                // T -> (R,S)
                b1 = pf.GetItems(s, r, 1, 2);
                c1 = pf.GetItems(r, t, 1, 1);
                if (!(b1 == null || c1 == null)) {
                    int[,] pairs = pf.GetKBestPairs(b1, c1);
                    for (int k = 0; k < K; k++) {
                        if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                        int comp1 = pairs[k, 0];
                        int comp2 = pairs[k, 1];
                        double bc = b1[comp1].Prob + c1[comp2].Prob;
                        // Edge score plus triple score for (t, r, s) and sibling score for (r, s).
                        bc += prodProbTs + probsTrips[t, r, s] + probsSibs[r, s, 1];
                        FeatureVector fvFin = pf.Cat(prodFvTs, pf.Cat(fvsTrips[t, r, s], fvsSibs[r, s, 1]));
                        if (m_pipe.Labeled) {
                            bc += ntProbT11 + ntProbS10;
                            fvFin = FeatureVector.Cat(ntFvT11, FeatureVector.Cat(ntFvS10, fvFin));
                        }
                        pf.Add(s, r, t, type2, 1, 1, bc, fvFin, b1[comp1], c1[comp2]);
                    }
                }
            }
            // Finish off pieces: incomplete + complete -> complete
            for (int r = s; r <= t; r++) {
                if (r != s) {
                    ParseForestItem[] b1 = pf.GetItems(s, r, 0, 1);
                    ParseForestItem[] c1 = pf.GetItems(r, t, 0, 0);
                    if (!(b1 == null || c1 == null)) {
                        int[,] pairs = pf.GetKBestPairs(b1, c1);
                        for (int k = 0; k < K; k++) {
                            if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                            int comp1 = pairs[k, 0];
                            int comp2 = pairs[k, 1];
                            double bc = b1[comp1].Prob + c1[comp2].Prob;
                            // Stop trying further pairs as soon as pf.Add returns false.
                            if ( !pf.Add(s, r, t, -1, 0, 0, bc, new FeatureVector(), b1[comp1], c1[comp2])) break;
                        }
                    }
                }
                if (r != t) {
                    ParseForestItem[] b1 = pf.GetItems(s, r, 1, 0);
                    ParseForestItem[] c1 = pf.GetItems(r, t, 1, 1);
                    if (!(b1 == null || c1 == null)) {
                        int[,] pairs = pf.GetKBestPairs(b1, c1);
                        for (int k = 0; k < K; k++) {
                            if (pairs[k, 0] == -1 || pairs[k, 1] == -1) break;
                            int comp1 = pairs[k, 0];
                            int comp2 = pairs[k, 1];
                            double bc = b1[comp1].Prob + c1[comp2].Prob;
                            // Stop trying further pairs as soon as pf.Add returns false.
                            if ( !pf.Add(s, r, t, -1, 1, 0, bc, new FeatureVector(), b1[comp1], c1[comp2])) break;
                        }
                    }
                }
            }
        }
    }
    return pf.GetBestParses();
}
// Writes every first-order candidate feature vector of an instance to the forest file.
// Layout: each feature vector is a run of non-negative feature indices ended by -2;
// the edge section and (when Labeled) the label section are each ended by -3; the
// instance's own feature indices are ended by -4; then come the sentence, -5, the
// POS tags, -6, the labels, -7, the gold parse string, and -1.
// NOTE(review): an IOException is swallowed here, so a failed write leaves a
// truncated record without any signal to the caller - confirm this is intended.
public virtual void WritePossibleFeatures(DependencyInstance inst, BinaryWriter writer)
{
    string[] words = inst.Sentence;
    string[] tags = inst.POS;
    int n = words.Length;

    // First character of every POS tag.
    var coarseTags = new string[tags.Length];
    for (int idx = 0; idx < tags.Length; idx++)
    {
        coarseTags[idx] = tags[idx].SubstringWithIndex(0, 1);
    }

    try
    {
        // Edge features for every ordered pair and both attachment directions.
        for (int left = 0; left < n; left++)
        {
            for (int right = left + 1; right < n; right++)
            {
                for (int dir = 0; dir < 2; dir++)
                {
                    bool attR = dir == 0;
                    FeatureVector edgeFv = CreateFeatureVector(words, tags, coarseTags, left, right, attR, new FeatureVector());
                    WriteFeatureRecord(writer, edgeFv);
                }
            }
        }
        writer.Write(-3);

        // Label features per word, type, direction and child flag.
        if (Labeled)
        {
            for (int word = 0; word < n; word++)
            {
                for (int t = 0; t < Types.Length; t++)
                {
                    string type = Types[t];
                    for (int dir = 0; dir < 2; dir++)
                    {
                        bool attR = dir == 0;
                        for (int ch = 0; ch < 2; ch++)
                        {
                            bool child = ch == 0;
                            FeatureVector labelFv = CreateFeatureVector(words, tags, coarseTags, word, type, attR, child, new FeatureVector());
                            WriteFeatureRecord(writer, labelFv);
                        }
                    }
                }
            }
            writer.Write(-3);
        }

        // The instance's own features are written unfiltered.
        foreach (Feature feature in inst.Fv.FVector)
        {
            writer.Write(feature.Index);
        }
        writer.Write(-4);

        writer.Write(inst.Sentence.Length);
        foreach (string token in inst.Sentence)
        {
            writer.Write(token);
        }
        writer.Write(-5);

        writer.Write(inst.POS.Length);
        foreach (string tag in inst.POS)
        {
            writer.Write(tag);
        }
        writer.Write(-6);

        writer.Write(inst.Labs.Length);
        foreach (string label in inst.Labs)
        {
            writer.Write(label);
        }
        writer.Write(-7);

        writer.Write(inst.ActParseTree);
        writer.Write(-1);
    }
    catch (IOException)
    {
    }
}

// Writes the non-negative feature indices of one vector followed by the -2 terminator.
private static void WriteFeatureRecord(BinaryWriter sink, FeatureVector vector)
{
    foreach (Feature feature in vector.FVector)
    {
        if (feature.Index >= 0)
        {
            sink.Write(feature.Index);
        }
    }
    sink.Write(-2);
}
// Reads one first-order instance record from the forest file, fills the edge and
// label tables with the stored feature vectors and their scores under the given
// parameters, and returns a new instance rebuilt from the record.
//   reader     - positioned at the start of a record
//   inst       - supplies the sentence length that sizes the loops
//   fvs/probs, ntFvs/ntProbs - output tables
// Throws Exception("Bad File Format") when a section or record terminator is
// missing or the trailing part of the record cannot be read.
public DependencyInstance ReadFeatureVector(BinaryReader reader, DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,,] ntFvs, double[,,,] ntProbs, Parameters parameters)
{
    int length = inst.Length;

    // Edge section: one -2 terminated vector per (w1 < w2, direction).
    for (int w1 = 0; w1 < length; w1++)
    {
        for (int w2 = w1 + 1; w2 < length; w2++)
        {
            for (int ph = 0; ph < 2; ph++)
            {
                FeatureVector prodFV = ReadFeatureRun(reader, -2);
                double prodProb = parameters.GetScore(prodFV);
                fvs[w1, w2, ph] = prodFV;
                probs[w1, w2, ph] = prodProb;
            }
        }
    }
    ExpectSectionEnd(reader);

    // Label section: only present when the pipe is labeled.
    if (Labeled)
    {
        for (int w1 = 0; w1 < length; w1++)
        {
            for (int t = 0; t < Types.Length; t++)
            {
                for (int ph = 0; ph < 2; ph++)
                {
                    for (int ch = 0; ch < 2; ch++)
                    {
                        FeatureVector prodFV = ReadFeatureRun(reader, -2);
                        double ntProb = parameters.GetScore(prodFV);
                        ntFvs[w1, t, ph, ch] = prodFV;
                        ntProbs[w1, t, ph, ch] = ntProb;
                    }
                }
            }
        }
        ExpectSectionEnd(reader);
    }

    // The instance's own feature vector is terminated by -4.
    FeatureVector nfv = ReadFeatureRun(reader, -4);

    string[] toks = null;
    string[] pos = null;
    string[] labs = null;
    string actParseTree = null;
    int terminator = 0;
    try
    {
        toks = ReadStringArray(reader);
        reader.ReadInt32(); // marker written after the sentence (value is not checked)
        pos = ReadStringArray(reader);
        reader.ReadInt32(); // marker written after the POS tags (value is not checked)
        labs = ReadStringArray(reader);
        reader.ReadInt32(); // marker written after the labels (value is not checked)
        actParseTree = reader.ReadString();
        terminator = reader.ReadInt32();
    }
    catch (Exception e)
    {
        Console.WriteLine("Error reading file.");
        // Keep the original failure as the inner exception so the root cause is not lost.
        throw new Exception("Bad File Format", e);
    }

    if (terminator != -1)
    {
        Console.WriteLine("Error reading file.");
        throw new Exception("Bad File Format");
    }

    var pti = new DependencyInstance(toks, pos, labs, nfv);
    pti.ActParseTree = actParseTree;
    return pti;
}

// Reads feature indices until the given terminator and adds each one with value 1.0.
private FeatureVector ReadFeatureRun(BinaryReader reader, int terminator)
{
    var vector = new FeatureVector();
    int index = reader.ReadInt32();
    while (index != terminator)
    {
        AddNewFeature(index, 1.0, vector);
        index = reader.ReadInt32();
    }
    return vector;
}

// Consumes the -3 section terminator or reports a malformed file.
private void ExpectSectionEnd(BinaryReader reader)
{
    int last = reader.ReadInt32();
    if (last != -3)
    {
        Console.WriteLine("Error reading file.");
        throw new Exception("Bad File Format");
    }
}

// Reads a length-prefixed array of strings.
private static string[] ReadStringArray(BinaryReader reader)
{
    int len = reader.ReadInt32();
    var values = new string[len];
    for (int i = 0; i < len; i++)
    {
        values[i] = reader.ReadString();
    }
    return values;
}
// Builds the alphabets from the training file, then reads it sentence by sentence.
// When CreateForest is set, every sentence's candidate features are written to
// featFileName. Only a placeholder instance carrying the sentence length is kept
// per sentence; the full instance is used for writing and then dropped.
//   returns - one length-only DependencyInstance per sentence, in file order
public DependencyInstance[] CreateInstances(string fileName, string featFileName)
{
    CreateAlphabet(fileName);
    Console.WriteLine("Num Features: " + DataAlphabet.Count);

    var instances = new List<DependencyInstance>();

    // 'using' closes both files even if parsing or feature extraction throws.
    using (var reader = new StreamReader(new FileStream(fileName, FileMode.Open), Encoding.UTF8))
    {
        string[][] lines = ReadLines(reader);

        // A null resource is allowed in a using statement; nothing is disposed then.
        using (BinaryWriter bWriter = CreateForest
                                          ? new BinaryWriter(new FileStream(featFileName, FileMode.Create))
                                          : null)
        {
            while (lines != null)
            {
                string[] toks = lines[0];
                string[] pos = lines[1];
                string[] labs = lines[2];
                string[] deps = lines[3];

                var deps1 = new int[deps.Length];
                for (int i = 0; i < deps.Length; i++)
                {
                    deps1[i] = int.Parse(deps[i]);
                }

                FeatureVector fv = CreateFeatureVector(toks, pos, labs, deps1);
                var full = new DependencyInstance(toks, pos, labs, fv);

                // Gold parse encoded as "head|index:labelIndex" items.
                string spans = "";
                for (int i = 1; i < deps.Length; i++)
                {
                    spans += deps[i] + "|" + i + ":" + TypeAlphabet.LookupIndex(labs[i]) + " ";
                }
                full.ActParseTree = spans.Trim();

                if (bWriter != null)
                {
                    WritePossibleFeatures(full, bWriter);
                }

                instances.Add(new DependencyInstance(toks.Length));
                lines = ReadLines(reader);
            }

            CloseAlphabets();
        }
    }

    return instances.ToArray();
}
// TODO: sina: rename it to ReadFeatureVector
// Reads one second-order instance record from the forest file, fills the edge,
// label, triple and sibling tables with the stored feature vectors and their
// scores under the given parameters, and returns a new instance rebuilt from the record.
//   reader - positioned at the start of a record
//   inst   - supplies the sentence length that sizes the loops
// Throws Exception("Bad File Format") when a section or record terminator is
// missing or the trailing part of the record cannot be read.
public DependencyInstance GetFeatureVector(BinaryReader reader, DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,] fvsTrips, double[,,] probsTrips, FeatureVector[,,] fvsSibs, double[,,] probsSibs, FeatureVector[,,,] ntFvs, double[,,,] ntProbs, Parameters @params)
{
    int length = inst.Length;

    // Edge section: one -2 terminated vector per (w1 < w2, direction).
    for (int w1 = 0; w1 < length; w1++)
    {
        for (int w2 = w1 + 1; w2 < length; w2++)
        {
            for (int ph = 0; ph < 2; ph++)
            {
                FeatureVector prodFV = ReadFeatureRun2O(reader, -2);
                double prodProb = @params.GetScore(prodFV);
                fvs[w1, w2, ph] = prodFV;
                probs[w1, w2, ph] = prodProb;
            }
        }
    }
    ExpectSectionEnd2O(reader);

    // Label section: only present when the pipe is labeled.
    if (Labeled)
    {
        for (int w1 = 0; w1 < length; w1++)
        {
            for (int t = 0; t < Types.Length; t++)
            {
                for (int ph = 0; ph < 2; ph++)
                {
                    for (int ch = 0; ch < 2; ch++)
                    {
                        FeatureVector prodFV = ReadFeatureRun2O(reader, -2);
                        double ntProb = @params.GetScore(prodFV);
                        ntFvs[w1, t, ph, ch] = prodFV;
                        ntProbs[w1, t, ph, ch] = ntProb;
                    }
                }
            }
        }
        ExpectSectionEnd2O(reader);
    }

    // Triple section: indices growing to the right first, then to the left.
    for (int w1 = 0; w1 < length; w1++)
    {
        for (int w2 = w1; w2 < length; w2++)
        {
            for (int w3 = w2 + 1; w3 < length; w3++)
            {
                FeatureVector prodFV = ReadFeatureRun2O(reader, -2);
                double prodProb = @params.GetScore(prodFV);
                fvsTrips[w1, w2, w3] = prodFV;
                probsTrips[w1, w2, w3] = prodProb;
            }
        }
        for (int w2 = w1; w2 >= 0; w2--)
        {
            for (int w3 = w2 - 1; w3 >= 0; w3--)
            {
                FeatureVector prodFV = ReadFeatureRun2O(reader, -2);
                double prodProb = @params.GetScore(prodFV);
                fvsTrips[w1, w2, w3] = prodFV;
                probsTrips[w1, w2, w3] = prodProb;
            }
        }
    }
    ExpectSectionEnd2O(reader);

    // Sibling section: every pair of distinct positions, two flag values each.
    for (int w1 = 0; w1 < length; w1++)
    {
        for (int w2 = 0; w2 < length; w2++)
        {
            if (w1 == w2)
            {
                continue;
            }
            for (int wh = 0; wh < 2; wh++)
            {
                FeatureVector prodFV = ReadFeatureRun2O(reader, -2);
                double prodProb = @params.GetScore(prodFV);
                fvsSibs[w1, w2, wh] = prodFV;
                probsSibs[w1, w2, wh] = prodProb;
            }
        }
    }
    ExpectSectionEnd2O(reader);

    // The instance's own feature vector is terminated by -4.
    FeatureVector nfv = ReadFeatureRun2O(reader, -4);

    string[] toks = null;
    string[] pos = null;
    string[] labs = null;
    string actParseTree = null;
    int terminator = 0;
    try
    {
        toks = ReadStringArray2O(reader);
        // No marker is read between the sentence and the POS tags in this format.
        pos = ReadStringArray2O(reader);
        reader.ReadInt32(); // marker written after the POS tags (value is not checked)
        labs = ReadStringArray2O(reader);
        reader.ReadInt32(); // marker written after the labels (value is not checked)
        actParseTree = reader.ReadString();
        terminator = reader.ReadInt32();
    }
    catch (Exception e)
    {
        Console.WriteLine("Error reading file.");
        // Keep the original failure as the inner exception so the root cause is not lost.
        throw new Exception("Bad File Format", e);
    }

    if (terminator != -1)
    {
        Console.WriteLine("Error reading file.");
        throw new Exception("Bad File Format");
    }

    var pti = new DependencyInstance(toks, pos, labs, nfv);
    pti.ActParseTree = actParseTree;
    return pti;
}

// Reads feature indices until the given terminator and adds each one with value 1.0.
private FeatureVector ReadFeatureRun2O(BinaryReader reader, int terminator)
{
    var vector = new FeatureVector();
    int index = reader.ReadInt32();
    while (index != terminator)
    {
        AddNewFeature(index, 1.0, vector);
        index = reader.ReadInt32();
    }
    return vector;
}

// Consumes the -3 section terminator or reports a malformed file.
private void ExpectSectionEnd2O(BinaryReader reader)
{
    int last = reader.ReadInt32();
    if (last != -3)
    {
        Console.WriteLine("Error reading file.");
        throw new Exception("Bad File Format");
    }
}

// Reads a length-prefixed array of strings.
private static string[] ReadStringArray2O(BinaryReader reader)
{
    int len = reader.ReadInt32();
    var values = new string[len];
    for (int i = 0; i < len; i++)
    {
        values[i] = reader.ReadString();
    }
    return values;
}
// First-order projective K-best decoding with a fixed label type per edge.
// Original note: run time O(n^3 + Tn^2), where T is the number of types.
// Spans [s, t] are processed by increasing width; each span is built from the
// items already stored for its sub-spans.
// Returns whatever the forest's GetBestParses() produces for the filled chart.
public object[,] DecodeProjective(DependencyInstance inst, FeatureVector[,,] fvs, double[,,] probs, FeatureVector[,,,] ntFvs, double[,,,] ntProbs, int K)
{
    string[] words = inst.Sentence;
    int n = words.Length;
    bool labeled = m_pipe.Labeled;

    int[,] edgeTypes = null;
    if (labeled)
    {
        edgeTypes = GetTypes(ntProbs, n);
    }

    var forest = new KBestParseForest(0, n - 1, inst, K);

    // Seed the chart with two zero-score single-word items per position.
    for (int w = 0; w < n; w++)
    {
        forest.Add(w, -1, 0, 0.0, new FeatureVector());
        forest.Add(w, -1, 1, 0.0, new FeatureVector());
    }

    for (int width = 1; width < n; width++)
    {
        for (int s = 0; s + width < n; s++)
        {
            int t = s + width;

            // Edge features/scores between s and t for both direction slots.
            FeatureVector fvSlot0 = fvs[s, t, 0];
            FeatureVector fvSlot1 = fvs[s, t, 1];
            double scoreSlot0 = probs[s, t, 0];
            double scoreSlot1 = probs[s, t, 1];

            // Types are looked up for (s, t) and (t, s); 0 when unlabeled.
            int typeSt = labeled ? edgeTypes[s, t] : 0;
            int typeTs = labeled ? edgeTypes[t, s] : 0;

            FeatureVector labelFvS01 = ntFvs[s, typeSt, 0, 1];
            FeatureVector labelFvS10 = ntFvs[s, typeTs, 1, 0];
            FeatureVector labelFvT00 = ntFvs[t, typeSt, 0, 0];
            FeatureVector labelFvT11 = ntFvs[t, typeTs, 1, 1];
            double labelScoreS01 = ntProbs[s, typeSt, 0, 1];
            double labelScoreS10 = ntProbs[s, typeTs, 1, 0];
            double labelScoreT00 = ntProbs[t, typeSt, 0, 0];
            double labelScoreT11 = ntProbs[t, typeTs, 1, 1];

            // Join the items of [s, r] and [r + 1, t] and add the s-t edge in both slots.
            for (int r = s; r < t; r++)
            {
                ParseForestItem[] left = forest.GetItems(s, r, 0, 0);
                ParseForestItem[] right = forest.GetItems(r + 1, t, 1, 0);
                if (left == null || right == null)
                {
                    continue;
                }

                int[,] pairs = forest.GetKBestPairs(left, right);
                int limit = pairs.GetLength(0);
                for (int k = 0; k < limit; k++)
                {
                    int li = pairs[k, 0];
                    int ri = pairs[k, 1];
                    // A -1 entry ends the list of usable pairs.
                    if (li == -1 || ri == -1)
                    {
                        break;
                    }

                    double baseScore = left[li].Prob + right[ri].Prob;

                    double score = baseScore + scoreSlot0;
                    FeatureVector features = fvSlot0;
                    if (labeled)
                    {
                        features = FeatureVector.Cat(labelFvS01, FeatureVector.Cat(labelFvT00, features));
                        score += labelScoreS01 + labelScoreT00;
                    }
                    forest.Add(s, r, t, typeSt, 0, 1, score, features, left[li], right[ri]);

                    score = baseScore + scoreSlot1;
                    features = fvSlot1;
                    if (labeled)
                    {
                        features = FeatureVector.Cat(labelFvT11, FeatureVector.Cat(labelFvS10, features));
                        score += labelScoreT11 + labelScoreS10;
                    }
                    forest.Add(s, r, t, typeTs, 1, 1, score, features, left[li], right[ri]);
                }
            }

            // Merge the items stored under code 1 with those under code 0 into code-0 items
            // for both direction slots.
            for (int r = s; r <= t; r++)
            {
                if (r != s)
                {
                    ParseForestItem[] left = forest.GetItems(s, r, 0, 1);
                    ParseForestItem[] right = forest.GetItems(r, t, 0, 0);
                    if (left != null && right != null)
                    {
                        int[,] pairs = forest.GetKBestPairs(left, right);
                        int limit = pairs.GetLength(0);
                        for (int k = 0; k < limit; k++)
                        {
                            int li = pairs[k, 0];
                            int ri = pairs[k, 1];
                            if (li == -1 || ri == -1)
                            {
                                break;
                            }

                            double score = left[li].Prob + right[ri].Prob;
                            // Stop trying further pairs as soon as Add returns false.
                            bool added = forest.Add(s, r, t, -1, 0, 0, score, new FeatureVector(), left[li], right[ri]);
                            if (!added)
                            {
                                break;
                            }
                        }
                    }
                }

                if (r != t)
                {
                    ParseForestItem[] left = forest.GetItems(s, r, 1, 0);
                    ParseForestItem[] right = forest.GetItems(r, t, 1, 1);
                    if (left != null && right != null)
                    {
                        int[,] pairs = forest.GetKBestPairs(left, right);
                        int limit = pairs.GetLength(0);
                        for (int k = 0; k < limit; k++)
                        {
                            int li = pairs[k, 0];
                            int ri = pairs[k, 1];
                            if (li == -1 || ri == -1)
                            {
                                break;
                            }

                            double score = left[li].Prob + right[ri].Prob;
                            bool added = forest.Add(s, r, t, -1, 1, 0, score, new FeatureVector(), left[li], right[ri]);
                            if (!added)
                            {
                                break;
                            }
                        }
                    }
                }
            }
        }
    }

    return forest.GetBestParses();
}
// Reads the next sentence from the reader and wraps it in a DependencyInstance.
// Returns null when ReadLines yields no more data. The instance's ActParseTree is
// the gold parse encoded as "head|index:labelIndex" items separated by single
// spaces (position 0 is not included).
public DependencyInstance CreateInstance(StreamReader reader)
{
    string[][] block = ReadLines(reader);
    if (block == null)
    {
        return null;
    }

    string[] toks = block[0];
    string[] pos = block[1];
    string[] labs = block[2];
    string[] deps = block[3];

    // Heads arrive as text and are parsed into integers for feature extraction.
    var heads = new int[deps.Length];
    int slot = 0;
    foreach (string head in deps)
    {
        heads[slot] = int.Parse(head);
        slot++;
    }

    FeatureVector goldFv = CreateFeatureVector(toks, pos, labs, heads);
    var instance = new DependencyInstance(toks, pos, labs, goldFv);

    string tree = "";
    int index = 1;
    while (index < deps.Length)
    {
        tree += deps[index] + "|" + index + ":" + TypeAlphabet.LookupIndex(labs[index]) + " ";
        index++;
    }

    // Remove the trailing separator.
    instance.ActParseTree = tree.Trim();
    return instance;
}