Example #1
0
    /// <summary>
    /// Adds one proximity observation for the word quadruple
    /// (<paramref name="word1"/>, <paramref name="word2"/>, <paramref name="word3"/>, <paramref name="word4"/>).
    /// </summary>
    /// <remarks>
    /// The four words are passed to RegisterWord and the resulting indices are combined into a
    /// key by Words2Foursome. If the key is already tracked, its proximity sum grows by
    /// <paramref name="proximity"/> and its counter by one. A key that is not yet tracked is
    /// admitted only while fewer than <paramref name="max_four_count"/> keys are stored;
    /// otherwise the observation is discarded.
    /// </remarks>
    /// <param name="max_four_count">Upper bound on the number of distinct quadruples kept.</param>
    /// <param name="word1">First word of the quadruple.</param>
    /// <param name="word2">Second word of the quadruple.</param>
    /// <param name="word3">Third word of the quadruple.</param>
    /// <param name="word4">Fourth word of the quadruple.</param>
    /// <param name="proximity">Proximity value to accumulate for the quadruple.</param>
    public void StoreFourProximity(int max_four_count, string word1, string word2, string word3, string word4, float proximity)
    {
        int id1 = RegisterWord(word1);
        int id2 = RegisterWord(word2);
        int id3 = RegisterWord(word3);
        int id4 = RegisterWord(word4);

        SoftFluent.Int128 key = Words2Foursome(id1, id2, id3, id4);

        float accumulated;

        // Known quadruple: update both running totals and stop.
        if (four2proximity.TryGetValue(key, out accumulated))
        {
            four2proximity[key] = accumulated + proximity;
            four2count[key]    += 1;
            return;
        }

        // New quadruple: admit it only while there is still room.
        if (four2count.Count >= max_four_count)
        {
            return;
        }

        four2count.Add(key, 1);
        four2proximity.Add(key, proximity);
    }
Example #2
0
    /// <summary>
    /// Packs three word indices into one 128-bit key: <paramref name="i1"/> is placed at bit 0,
    /// <paramref name="i2"/> is shifted left by 32 bits and <paramref name="i3"/> by 64 bits,
    /// and the three values are OR-ed together.
    /// </summary>
    /// <remarks>
    /// The indices are packed in the order they are given; they are not sorted first, so the
    /// key depends on argument order.
    /// NOTE(review): presumably the indices are non-negative; how a negative int converts to
    /// SoftFluent.Int128 is not visible here, so confirm before relying on that.
    /// </remarks>
    /// <param name="i1">Index stored in the lowest slot.</param>
    /// <param name="i2">Index stored in the slot starting at bit 32.</param>
    /// <param name="i3">Index stored in the slot starting at bit 64.</param>
    /// <returns>The combined key.</returns>
    static SoftFluent.Int128 Words2Triple(int i1, int i2, int i3)
    {
        SoftFluent.Int128 slot0 = i1;
        SoftFluent.Int128 slot1 = i2;
        SoftFluent.Int128 slot2 = i3;

        var shifted1 = slot1 << 32;
        var shifted2 = slot2 << 64;

        return slot0 | shifted1 | shifted2;
    }
Example #3
0
    /// <summary>
    /// Looks up the value stored in four2counts1 for the quadruple key <paramref name="i1234"/>.
    /// </summary>
    /// <param name="i1234">Packed quadruple key.</param>
    /// <returns>The stored value, or 0 when the key is not present.</returns>
    int GetFourCount1(SoftFluent.Int128 i1234)
    {
        int stored;
        bool found = four2counts1.TryGetValue(i1234, out stored);

        // A failed TryGetValue leaves the default (0) in the out variable.
        return found ? stored : 0;
    }
Example #4
0
    /// <summary>
    /// Adds one proximity observation for the word triple
    /// (<paramref name="word1"/>, <paramref name="word2"/>, <paramref name="word3"/>).
    /// </summary>
    /// <remarks>
    /// The three words are passed to RegisterWord and the resulting indices are packed into a
    /// key by Words2Triple. If the key is already tracked, its proximity sum grows by
    /// <paramref name="proximity"/> and its counter by one. A key that is not yet tracked is
    /// admitted only while fewer than <paramref name="max_triple_count"/> keys are stored;
    /// otherwise the observation is discarded.
    /// </remarks>
    /// <param name="max_triple_count">Upper bound on the number of distinct triples kept.</param>
    /// <param name="word1">First word of the triple.</param>
    /// <param name="word2">Second word of the triple.</param>
    /// <param name="word3">Third word of the triple.</param>
    /// <param name="proximity">Proximity value to accumulate for the triple.</param>
    public void StoreTripleProximity(int max_triple_count, string word1, string word2, string word3, float proximity)
    {
        int id1 = RegisterWord(word1);
        int id2 = RegisterWord(word2);
        int id3 = RegisterWord(word3);

        SoftFluent.Int128 key = Words2Triple(id1, id2, id3);

        float accumulated;

        // Known triple: update both running totals and stop.
        if (triple2proximity.TryGetValue(key, out accumulated))
        {
            triple2proximity[key] = accumulated + proximity;
            triple2count[key]    += 1;
            return;
        }

        // TODO: triples that occurred only once could be evicted when the count limit is reached ...

        // New triple: admit it only while there is still room.
        if (triple2count.Count >= max_triple_count)
        {
            return;
        }

        triple2count.Add(key, 1);
        triple2proximity.Add(key, proximity);
    }
Example #5
0
    /// <summary>
    /// Looks up the value stored in triple2counts1 for the triple key <paramref name="i123"/>.
    /// </summary>
    /// <param name="i123">Packed triple key.</param>
    /// <returns>The stored value, or 0 when the key is not present.</returns>
    int GetTripleCount1(SoftFluent.Int128 i123)
    {
        int stored;
        bool found = triple2counts1.TryGetValue(i123, out stored);

        // A failed TryGetValue leaves the default (0) in the out variable.
        return found ? stored : 0;
    }
Example #6
0
    /// <summary>
    /// Scores every stored word quadruple and writes the quadruples, highest score first,
    /// to a tab-separated text file.
    /// </summary>
    /// <remarks>
    /// The score of a quadruple is <c>a * ln(a / (f1 * f2 * f3 * f4))</c>, where <c>a</c> is the
    /// quadruple's accumulated proximity divided by the sum of all quadruple counts, and
    /// <c>f1..f4</c> are the frequencies of its words divided by the sum of all word
    /// frequencies. Each output line contains the four words followed by the score; the score
    /// is stored as a float and formatted with the invariant culture.
    /// NOTE(review): when the accumulated proximity is zero or negative the score evaluates to
    /// NaN; confirm whether such values can occur.
    /// </remarks>
    /// <param name="result_path">Path of the output file; an existing file is overwritten.</param>
    public void StoreFoursProximityDataset(string result_path)
    {
        Console.WriteLine("Storing {0} quadruples as dataset {1}...", four2count.Count, result_path);

        double N4 = four2count.Select(z => (double)z.Value).Sum(); // total count over all quadruples
        double N1 = word2freq.Select(z => (double)z.Value).Sum();  // total frequency over all words

        // One entry per quadruple, so the final size is known up front.
        List <Tuple <SoftFluent.Int128, float> > four_mi = new List <Tuple <SoftFluent.Int128, float> >(four2count.Count);

        foreach (SoftFluent.Int128 four in (four2count.Select(z => z.Key)))
        {
            float n1234 = four2proximity[four];

            // Unpack the four 32-bit word indices from the 128-bit key.
            int i4 = (int)(four >> 96).GetLow32();
            int i3 = (int)(four >> 64).GetLow32();
            int i2 = (int)(four >> 32).GetLow32();
            int i1 = (int)(four).GetLow32();

            // mutual-information style score for this quadruple of words
            double a = n1234 / N4;

            double f1 = word2freq[all_words[i1]] / N1;
            double f2 = word2freq[all_words[i2]] / N1;
            double f3 = word2freq[all_words[i3]] / N1;
            double f4 = word2freq[all_words[i4]] / N1;
            double mutual_information = a * Math.Log(a / (f1 * f2 * f3 * f4));

            four_mi.Add(new Tuple <SoftFluent.Int128, float>(four, (float)mutual_information));
        }


        using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(result_path))
        {
            foreach (var rec in four_mi.OrderByDescending(z => z.Item2))
            {
                SoftFluent.Int128 four = rec.Item1;

                int i4 = (int)(four >> 96).GetLow32();
                int i3 = (int)(four >> 64).GetLow32();
                int i2 = (int)(four >> 32).GetLow32();
                int i1 = (int)(four).GetLow32();

                // The float score is widened to double before it is formatted.
                double mutual_information = rec.Item2;

                wrt.WriteLine("{0}\t{1}\t{2}\t{3}\t{4}", all_words[i1], all_words[i2], all_words[i3], all_words[i4]
                              , mutual_information.ToString(System.Globalization.CultureInfo.InvariantCulture)
                              );
            }
        }
    }
Example #7
0
    /// <summary>
    /// Scores every stored word triple and writes the triples, highest score first,
    /// to a tab-separated text file.
    /// </summary>
    /// <remarks>
    /// The score of a triple is <c>a * ln(a / (f1 * f2 * f3))</c>, where <c>a</c> is the
    /// triple's accumulated proximity divided by the sum of all triple counts, and
    /// <c>f1..f3</c> are the frequencies of its words divided by the sum of all word
    /// frequencies. Each output line contains the three words followed by the score; the score
    /// is stored as a float and formatted with the invariant culture.
    /// NOTE(review): when the accumulated proximity is zero or negative the score evaluates to
    /// NaN; confirm whether such values can occur.
    /// </remarks>
    /// <param name="result_path">Path of the output file; an existing file is overwritten.</param>
    public void StoreTriplesProximityDataset(string result_path)
    {
        Console.WriteLine("Storing {0} triples as dataset {1}...", triple2count.Count, result_path);

        double N3 = triple2count.Select(z => (double)z.Value).Sum(); // total count over all triples
        double N1 = word2freq.Select(z => (double)z.Value).Sum();    // total frequency over all words

        // One entry per triple, so the final size is known up front.
        List <Tuple <SoftFluent.Int128, float> > triple_mi = new List <Tuple <SoftFluent.Int128, float> >(triple2count.Count);

        foreach (SoftFluent.Int128 triple in (triple2count.Select(z => z.Key)))
        {
            float n123 = triple2proximity[triple];

            // Unpack the three 32-bit word indices from the 128-bit key.
            int i3 = (int)(triple >> 64).GetLow32();
            int i2 = (int)(triple >> 32).GetLow32();
            int i1 = (int)(triple).GetLow32();

            // mutual-information style score for this triple of words
            double a = n123 / N3;

            double f1 = word2freq[all_words[i1]] / N1;
            double f2 = word2freq[all_words[i2]] / N1;
            double f3 = word2freq[all_words[i3]] / N1;
            double mutual_information = a * Math.Log(a / (f1 * f2 * f3));

            triple_mi.Add(new Tuple <SoftFluent.Int128, float>(triple, (float)mutual_information));
        }


        using (System.IO.StreamWriter wrt = new System.IO.StreamWriter(result_path))
        {
            foreach (var rec in triple_mi.OrderByDescending(z => z.Item2))
            {
                SoftFluent.Int128 triple = rec.Item1;

                int i3 = (int)(triple >> 64).GetLow32();
                int i2 = (int)(triple >> 32).GetLow32();
                int i1 = (int)(triple).GetLow32();

                // The float score is widened to double before it is formatted.
                double mutual_information = rec.Item2;

                wrt.WriteLine("{0}\t{1}\t{2}\t{3}", all_words[i1], all_words[i2], all_words[i3]
                              , mutual_information.ToString(System.Globalization.CultureInfo.InvariantCulture)
                              );
            }
        }
    }
Example #8
0
    // --------------------------------------------------

    /// <summary>
    /// Counts one occurrence of a word quadruple in one of two in-memory buffers.
    /// </summary>
    /// <remarks>
    /// The occurrence goes to four2counts1_buf when <paramref name="linked"/> is true and to
    /// four2counts0_buf otherwise. When inserting a new key makes the chosen buffer hold more
    /// than MAX_IN_BUF entries, FlushNGrams is called with <paramref name="max_four_count"/>.
    /// Incrementing an existing key never triggers a flush.
    /// </remarks>
    /// <param name="max_four_count">Value forwarded to FlushNGrams when a flush is triggered.</param>
    /// <param name="word1">First word of the quadruple.</param>
    /// <param name="word2">Second word of the quadruple.</param>
    /// <param name="word3">Third word of the quadruple.</param>
    /// <param name="word4">Fourth word of the quadruple.</param>
    /// <param name="linked">Selects which of the two buffers receives the occurrence.</param>
    void StoreWordFourLink(int max_four_count, string word1, string word2, string word3, string word4, bool linked)
    {
        int id1 = RegisterWord(word1);
        int id2 = RegisterWord(word2);
        int id3 = RegisterWord(word3);
        int id4 = RegisterWord(word4);

        SoftFluent.Int128 key = Words2Foursome(id1, id2, id3, id4);

        int seen;

        if (linked)
        {
            // Existing key: just bump the counter.
            if (four2counts1_buf.TryGetValue(key, out seen))
            {
                four2counts1_buf[key] = seen + 1;
                return;
            }

            // New key: insert, then flush if the buffer has grown past its limit.
            four2counts1_buf.Add(key, 1);

            if (four2counts1_buf.Count > MAX_IN_BUF)
            {
                FlushNGrams(max_four_count);
            }

            return;
        }

        // Existing key: just bump the counter.
        if (four2counts0_buf.TryGetValue(key, out seen))
        {
            four2counts0_buf[key] = seen + 1;
            return;
        }

        // New key: insert, then flush if the buffer has grown past its limit.
        four2counts0_buf.Add(key, 1);

        if (four2counts0_buf.Count > MAX_IN_BUF)
        {
            FlushNGrams(max_four_count);
        }
    }
Example #9
0
    // --------------------------------------------------

    /// <summary>
    /// Counts one occurrence of a word triple in one of two in-memory buffers.
    /// </summary>
    /// <remarks>
    /// The occurrence goes to triple2counts1_buf when <paramref name="linked"/> is true and to
    /// triple2counts0_buf otherwise. When inserting a new key makes the chosen buffer hold more
    /// than MAX_IN_BUF entries, FlushNGrams is called with <paramref name="max_triple_count"/>.
    /// Incrementing an existing key never triggers a flush.
    /// </remarks>
    /// <param name="max_triple_count">Value forwarded to FlushNGrams when a flush is triggered.</param>
    /// <param name="word1">First word of the triple.</param>
    /// <param name="word2">Second word of the triple.</param>
    /// <param name="word3">Third word of the triple.</param>
    /// <param name="linked">Selects which of the two buffers receives the occurrence.</param>
    void StoreWordTripleLink(int max_triple_count, string word1, string word2, string word3, bool linked)
    {
        int id1 = RegisterWord(word1);
        int id2 = RegisterWord(word2);
        int id3 = RegisterWord(word3);

        SoftFluent.Int128 key = Words2Triple(id1, id2, id3);

        int seen;

        if (linked)
        {
            // Existing key: just bump the counter.
            if (triple2counts1_buf.TryGetValue(key, out seen))
            {
                triple2counts1_buf[key] = seen + 1;
                return;
            }

            // New key: insert, then flush if the buffer has grown past its limit.
            triple2counts1_buf.Add(key, 1);

            if (triple2counts1_buf.Count > MAX_IN_BUF)
            {
                FlushNGrams(max_triple_count);
            }

            return;
        }

        // Existing key: just bump the counter.
        if (triple2counts0_buf.TryGetValue(key, out seen))
        {
            triple2counts0_buf[key] = seen + 1;
            return;
        }

        // New key: insert, then flush if the buffer has grown past its limit.
        triple2counts0_buf.Add(key, 1);

        if (triple2counts0_buf.Count > MAX_IN_BUF)
        {
            FlushNGrams(max_triple_count);
        }
    }
Example #10
0
 /// <summary>
 /// Returns the sum of GetFourCount0 and GetFourCount1 for the quadruple key
 /// <paramref name="i1234"/>.
 /// </summary>
 /// <param name="i1234">Packed quadruple key.</param>
 /// <returns>The combined count from both lookups.</returns>
 int GetFourCount(SoftFluent.Int128 i1234)
 {
     int count0 = GetFourCount0(i1234);
     int count1 = GetFourCount1(i1234);

     return count0 + count1;
 }
Example #11
0
 /// <summary>
 /// Returns the sum of GetTripleCount0 and GetTripleCount1 for the triple key
 /// <paramref name="i123"/>.
 /// </summary>
 /// <param name="i123">Packed triple key.</param>
 /// <returns>The combined count from both lookups.</returns>
 int GetTripleCount(SoftFluent.Int128 i123)
 {
     int count0 = GetTripleCount0(i123);
     int count1 = GetTripleCount1(i123);

     return count0 + count1;
 }