/// <summary>
/// Accumulates a proximity score for an ordered combination of four words.
/// </summary>
/// <remarks>
/// All four words are passed through RegisterWord regardless of the outcome.
/// If the combination is already present in four2proximity, its proximity sum
/// and its occurrence counter are both increased. A combination seen for the
/// first time is inserted only while four2count holds fewer than
/// <paramref name="max_four_count"/> entries; otherwise the observation is dropped.
/// </remarks>
/// <param name="max_four_count">Upper bound on the number of distinct foursomes kept.</param>
/// <param name="word1">First word of the combination.</param>
/// <param name="word2">Second word of the combination.</param>
/// <param name="word3">Third word of the combination.</param>
/// <param name="word4">Fourth word of the combination.</param>
/// <param name="proximity">Proximity value to add for this observation.</param>
public void StoreFourProximity(int max_four_count, string word1, string word2, string word3, string word4, float proximity)
{
    // Arguments are evaluated left to right, so the words are registered in order 1..4.
    SoftFluent.Int128 key = Words2Foursome(
        RegisterWord(word1),
        RegisterWord(word2),
        RegisterWord(word3),
        RegisterWord(word4));

    float accumulated;
    if (four2proximity.TryGetValue(key, out accumulated))
    {
        // Known foursome: extend the running sum and bump the counter.
        four2proximity[key] = accumulated + proximity;
        four2count[key] += 1;
        return;
    }

    if (four2count.Count >= max_four_count)
    {
        // Storage limit reached: new foursomes are ignored.
        return;
    }

    four2count.Add(key, 1);
    four2proximity.Add(key, proximity);
}
/// <summary>
/// Packs three word indices into a single 128-bit key.
/// </summary>
/// <remarks>
/// <paramref name="i1"/> is placed at bit 0, <paramref name="i2"/> at bit 32 and
/// <paramref name="i3"/> at bit 64; the three parts are combined with bitwise OR.
/// The indices are not sorted before packing, so the argument order is part of the key.
/// NOTE(review): assumes the indices are non-negative; how SoftFluent.Int128
/// converts a negative int is not visible here - confirm.
/// </remarks>
/// <param name="i1">Index stored in the lowest 32-bit lane.</param>
/// <param name="i2">Index stored in the second 32-bit lane.</param>
/// <param name="i3">Index stored in the third 32-bit lane.</param>
/// <returns>The packed key.</returns>
static SoftFluent.Int128 Words2Triple(int i1, int i2, int i3)
{
    SoftFluent.Int128 low_lane = i1;
    SoftFluent.Int128 middle_lane = i2;
    SoftFluent.Int128 high_lane = i3;

    SoftFluent.Int128 packed = low_lane | (middle_lane << 32);
    packed = packed | (high_lane << 64);
    return packed;
}
/// <summary>
/// Looks up the counter stored in four2counts1 for a packed foursome key.
/// </summary>
/// <param name="i1234">Packed foursome key.</param>
/// <returns>The stored counter, or 0 when the key is not present.</returns>
int GetFourCount1(SoftFluent.Int128 i1234)
{
    int stored;
    bool found = four2counts1.TryGetValue(i1234, out stored);

    // A failed lookup yields default(int), so the result is 0 for unknown keys.
    return found ? stored : 0;
}
/// <summary>
/// Accumulates a proximity score for an ordered combination of three words.
/// </summary>
/// <remarks>
/// All three words are passed through RegisterWord regardless of the outcome.
/// If the combination is already present in triple2proximity, its proximity sum
/// and its occurrence counter are both increased. A combination seen for the
/// first time is inserted only while triple2count holds fewer than
/// <paramref name="max_triple_count"/> entries; otherwise the observation is dropped.
/// </remarks>
/// <param name="max_triple_count">Upper bound on the number of distinct triples kept.</param>
/// <param name="word1">First word of the combination.</param>
/// <param name="word2">Second word of the combination.</param>
/// <param name="word3">Third word of the combination.</param>
/// <param name="proximity">Proximity value to add for this observation.</param>
public void StoreTripleProximity(int max_triple_count, string word1, string word2, string word3, float proximity)
{
    // Arguments are evaluated left to right, so the words are registered in order 1..3.
    SoftFluent.Int128 key = Words2Triple(
        RegisterWord(word1),
        RegisterWord(word2),
        RegisterWord(word3));

    float accumulated;
    if (triple2proximity.TryGetValue(key, out accumulated))
    {
        // Known triple: extend the running sum and bump the counter.
        triple2proximity[key] = accumulated + proximity;
        triple2count[key] += 1;
        return;
    }

    // TODO: once the limit is reached, triples that occurred only once could be evicted to make room ...
    if (triple2count.Count >= max_triple_count)
    {
        // Storage limit reached: new triples are ignored.
        return;
    }

    triple2count.Add(key, 1);
    triple2proximity.Add(key, proximity);
}
/// <summary>
/// Looks up the counter stored in triple2counts1 for a packed triple key.
/// </summary>
/// <param name="i123">Packed triple key.</param>
/// <returns>The stored counter, or 0 when the key is not present.</returns>
int GetTripleCount1(SoftFluent.Int128 i123)
{
    int stored;
    bool found = triple2counts1.TryGetValue(i123, out stored);

    // A failed lookup yields default(int), so the result is 0 for unknown keys.
    return found ? stored : 0;
}
/// <summary>
/// Scores every stored four-word combination and writes the result as a
/// tab-separated text file, highest score first.
/// </summary>
/// <remarks>
/// For each foursome the score is <c>a * ln(a / (f1 * f2 * f3 * f4))</c>, where
/// <c>a</c> is the accumulated proximity of the foursome divided by the sum of all
/// foursome counts, and <c>f1..f4</c> are the frequencies of the individual words
/// divided by the sum of all word frequencies. Scores are stored as float.
/// Each output line contains the four words followed by the score, formatted
/// with the invariant culture.
/// </remarks>
/// <param name="result_path">Path of the output file; an existing file is overwritten.</param>
public void StoreFoursProximityDataset(string result_path)
{
    Console.WriteLine("Storing {0} quadruples as dataset {1}...", four2count.Count, result_path);

    double N4 = four2count.Select(z => (double)z.Value).Sum(); // total count of all foursomes
    double N1 = word2freq.Select(z => (double)z.Value).Sum(); // total frequency of all words

    // One score per stored foursome, so the final size is known up front.
    List<Tuple<SoftFluent.Int128, float>> four_mi = new List<Tuple<SoftFluent.Int128, float>>(four2count.Count);

    foreach (SoftFluent.Int128 four in four2count.Select(z => z.Key))
    {
        float n1234 = four2proximity[four];

        // Each word index occupies one 32-bit lane of the packed key.
        int i4 = (int)(four >> 96).GetLow32();
        int i3 = (int)(four >> 64).GetLow32();
        int i2 = (int)(four >> 32).GetLow32();
        int i1 = (int)(four).GetLow32();

        // mutual information for this foursome of words
        double a = n1234 / N4;
        double f1 = word2freq[all_words[i1]] / N1;
        double f2 = word2freq[all_words[i2]] / N1;
        double f3 = word2freq[all_words[i3]] / N1;
        double f4 = word2freq[all_words[i4]] / N1;

        double mutual_information = a * Math.Log(a / (f1 * f2 * f3 * f4));
        four_mi.Add(new Tuple<SoftFluent.Int128, float>(four, (float)mutual_information));
    }

    using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(result_path))
    {
        foreach (var rec in four_mi.OrderByDescending(z => z.Item2))
        {
            SoftFluent.Int128 four = rec.Item1;

            // Only the word indices are needed here; the proximity sum was
            // already consumed when the score was computed.
            int i4 = (int)(four >> 96).GetLow32();
            int i3 = (int)(four >> 64).GetLow32();
            int i2 = (int)(four >> 32).GetLow32();
            int i1 = (int)(four).GetLow32();

            // The float score is widened to double before formatting; formatting
            // the float directly may print different digits, so keep the widening.
            double mutual_information = rec.Item2;

            wrt.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}",
                all_words[i1], all_words[i2], all_words[i3], all_words[i4],
                mutual_information.ToString(System.Globalization.CultureInfo.InvariantCulture));
        }
    }
}
/// <summary>
/// Scores every stored three-word combination and writes the result as a
/// tab-separated text file, highest score first.
/// </summary>
/// <remarks>
/// For each triple the score is <c>a * ln(a / (f1 * f2 * f3))</c>, where <c>a</c>
/// is the accumulated proximity of the triple divided by the sum of all triple
/// counts, and <c>f1..f3</c> are the frequencies of the individual words divided
/// by the sum of all word frequencies. Scores are stored as float.
/// Each output line contains the three words followed by the score, formatted
/// with the invariant culture.
/// </remarks>
/// <param name="result_path">Path of the output file; an existing file is overwritten.</param>
public void StoreTriplesProximityDataset(string result_path)
{
    Console.WriteLine("Storing {0} triples as dataset {1}...", triple2count.Count, result_path);

    double N3 = triple2count.Select(z => (double)z.Value).Sum(); // total count of all triples
    double N1 = word2freq.Select(z => (double)z.Value).Sum(); // total frequency of all words

    // One score per stored triple, so the final size is known up front.
    List<Tuple<SoftFluent.Int128, float>> triple_mi = new List<Tuple<SoftFluent.Int128, float>>(triple2count.Count);

    foreach (SoftFluent.Int128 triple in triple2count.Select(z => z.Key))
    {
        float n123 = triple2proximity[triple];

        // Each word index occupies one 32-bit lane of the packed key.
        int i3 = (int)(triple >> 64).GetLow32();
        int i2 = (int)(triple >> 32).GetLow32();
        int i1 = (int)(triple).GetLow32();

        // mutual information for this triple of words
        double a = n123 / N3;
        double f1 = word2freq[all_words[i1]] / N1;
        double f2 = word2freq[all_words[i2]] / N1;
        double f3 = word2freq[all_words[i3]] / N1;

        double mutual_information = a * Math.Log(a / (f1 * f2 * f3));
        triple_mi.Add(new Tuple<SoftFluent.Int128, float>(triple, (float)mutual_information));
    }

    using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(result_path))
    {
        foreach (var rec in triple_mi.OrderByDescending(z => z.Item2))
        {
            SoftFluent.Int128 triple = rec.Item1;

            // Only the word indices are needed here; the proximity sum was
            // already consumed when the score was computed.
            int i3 = (int)(triple >> 64).GetLow32();
            int i2 = (int)(triple >> 32).GetLow32();
            int i1 = (int)(triple).GetLow32();

            // The float score is widened to double before formatting; formatting
            // the float directly may print different digits, so keep the widening.
            double mutual_information = rec.Item2;

            wrt.WriteLine("{0}\t{1}\t{2}\t{3}",
                all_words[i1], all_words[i2], all_words[i3],
                mutual_information.ToString(System.Globalization.CultureInfo.InvariantCulture));
        }
    }
}
// --------------------------------------------------
/// <summary>
/// Counts one observation of a four-word combination in the buffer that
/// matches its <paramref name="linked"/> flag.
/// </summary>
/// <remarks>
/// Observations with <paramref name="linked"/> set go to four2counts1_buf, all
/// others to four2counts0_buf. When inserting a new key makes the chosen buffer
/// hold more than MAX_IN_BUF entries, FlushNGrams is called with
/// <paramref name="max_four_count"/>. Incrementing an existing key never triggers a flush.
/// </remarks>
/// <param name="max_four_count">Limit forwarded to FlushNGrams.</param>
/// <param name="word1">First word of the combination.</param>
/// <param name="word2">Second word of the combination.</param>
/// <param name="word3">Third word of the combination.</param>
/// <param name="word4">Fourth word of the combination.</param>
/// <param name="linked">Selects which of the two buffers receives the observation.</param>
void StoreWordFourLink(int max_four_count, string word1, string word2, string word3, string word4, bool linked)
{
    // Arguments are evaluated left to right, so the words are registered in order 1..4.
    SoftFluent.Int128 key = Words2Foursome(
        RegisterWord(word1),
        RegisterWord(word2),
        RegisterWord(word3),
        RegisterWord(word4));

    var buffer = linked ? four2counts1_buf : four2counts0_buf;

    int seen;
    if (buffer.TryGetValue(key, out seen))
    {
        buffer[key] = seen + 1;
        return;
    }

    buffer.Add(key, 1);
    if (buffer.Count > MAX_IN_BUF)
    {
        FlushNGrams(max_four_count);
    }
}
// --------------------------------------------------
/// <summary>
/// Counts one observation of a three-word combination in the buffer that
/// matches its <paramref name="linked"/> flag.
/// </summary>
/// <remarks>
/// Observations with <paramref name="linked"/> set go to triple2counts1_buf, all
/// others to triple2counts0_buf. When inserting a new key makes the chosen buffer
/// hold more than MAX_IN_BUF entries, FlushNGrams is called with
/// <paramref name="max_triple_count"/>. Incrementing an existing key never triggers a flush.
/// </remarks>
/// <param name="max_triple_count">Limit forwarded to FlushNGrams.</param>
/// <param name="word1">First word of the combination.</param>
/// <param name="word2">Second word of the combination.</param>
/// <param name="word3">Third word of the combination.</param>
/// <param name="linked">Selects which of the two buffers receives the observation.</param>
void StoreWordTripleLink(int max_triple_count, string word1, string word2, string word3, bool linked)
{
    // Arguments are evaluated left to right, so the words are registered in order 1..3.
    SoftFluent.Int128 key = Words2Triple(
        RegisterWord(word1),
        RegisterWord(word2),
        RegisterWord(word3));

    var buffer = linked ? triple2counts1_buf : triple2counts0_buf;

    int seen;
    if (buffer.TryGetValue(key, out seen))
    {
        buffer[key] = seen + 1;
        return;
    }

    buffer.Add(key, 1);
    if (buffer.Count > MAX_IN_BUF)
    {
        FlushNGrams(max_triple_count);
    }
}
/// <summary>
/// Returns the combined counter for a packed foursome key: the result of
/// GetFourCount0 plus the result of GetFourCount1.
/// </summary>
/// <param name="i1234">Packed foursome key.</param>
/// <returns>Sum of both counters.</returns>
int GetFourCount(SoftFluent.Int128 i1234)
{
    int count0 = GetFourCount0(i1234);
    int count1 = GetFourCount1(i1234);
    return count0 + count1;
}
/// <summary>
/// Returns the combined counter for a packed triple key: the result of
/// GetTripleCount0 plus the result of GetTripleCount1.
/// </summary>
/// <param name="i123">Packed triple key.</param>
/// <returns>Sum of both counters.</returns>
int GetTripleCount(SoftFluent.Int128 i123)
{
    int count0 = GetTripleCount0(i123);
    int count1 = GetTripleCount1(i123);
    return count0 + count1;
}