Beispiel #1
0
    /// <summary>
    /// Reads each file in <paramref name="files"/>, tokenizes its text and, for every
    /// TAG token whose key is not on the skip list, pushes the key into
    /// <paramref name="Model"/> and calls <c>Add(1d / CBOW.THRESHOLD)</c> on the
    /// entry returned by the push.
    /// </summary>
    /// <param name="Model">Matrix that receives the keys.</param>
    /// <param name="files">Set that is enumerated as a sequence of file paths.</param>
    /// <param name="lex">Orthography that maps a token's text to its key.</param>
    /// <param name="skipList">Keys to ignore; may be null, in which case nothing is skipped.</param>
    static void ParsePlainTextFiles(Matrix Model, Set files, IOrthography lex, Set skipList)
    {
        // A key is a stop word only when a skip list was supplied and it has an entry for the key.
        bool IsStopWord(string w) => skipList != null && skipList[w] != null;

        foreach (string path in (IEnumerable<string>)files)
        {
            Console.Write($"Reading {path}...\r\n");
            string content = File.ReadAllText(path);

            foreach (var token in PlainText.ForEach(content, 0, content.Length, 0))
            {
                // Only TAG tokens contribute to the model.
                if (token.Type != PlainTextTag.TAG)
                {
                    continue;
                }

                var key = lex.GetKey(
                    token.TextFragment.Substring(token.StartIndex, token.Length));
                if (IsStopWord(key))
                {
                    continue;
                }

                var entry = Model.Push(key);
                entry.Add(1d / CBOW.THRESHOLD);
            }
        }
    }
Beispiel #2
0
 /// <summary>
 /// Clears <c>Model</c>, then reads every file in <c>Files</c> and, for each TEXT
 /// token that yields a non-empty key, pushes the key into <c>Model</c> and applies
 /// <c>++</c> to the entry returned by the push. Progress is written to the console.
 /// </summary>
 public void Build()
 {
     Model.Clear();

     foreach (var path in Files)
     {
         Console.WriteLine($"Reading {Tools.GetShortPath(path)}...");
         string text = File.ReadAllText(path);

         foreach (var token in PlainText.ForEach(text))
         {
             // Only TEXT tokens are counted.
             if (token.Type != PlainTextTag.TEXT)
             {
                 continue;
             }

             var key = Orthography.GetKey(
                 text.Substring(token.StartIndex, token.Length));
             if (key == null || key.Length == 0)
             {
                 // No usable key for this token.
                 continue;
             }

             // NOTE(review): the increment is applied to the local returned by Push;
             // whether it updates the stored entry depends on how that type defines
             // ++ -- confirm against the Push return type.
             var entry = Model.Push(key);
             entry++;
         }
     }

     Console.WriteLine($"Done.");
 }
Beispiel #3
0
    /// <summary>
    /// Creates a <c>Matrix</c> with the given hash size and pushes into it the key of
    /// every TAG token found in the text of <paramref name="file"/>.
    /// </summary>
    /// <param name="file">Path of the text file to read.</param>
    /// <param name="lex">Orthography that maps a token's text to its key.</param>
    /// <param name="hashSize">Value passed to the <c>Matrix</c> constructor.</param>
    /// <returns>The populated matrix.</returns>
    static Matrix MakeWhiteList(string file, IOrthography lex, int hashSize)
    {
        var whiteList = new Matrix(hashSize);

        Console.Write($"\r\nReading {file}...\r\n\r\n");
        string content = File.ReadAllText(file);

        foreach (var token in PlainText.ForEach(content, 0, content.Length, 0))
        {
            // Tokens of any other type are ignored.
            if (token.Type != PlainTextTag.TAG)
            {
                continue;
            }

            string raw = token.TextFragment.Substring(token.StartIndex, token.Length);
            whiteList.Push(lex.GetKey(raw));
        }

        return whiteList;
    }
Beispiel #4
0
    /// <summary>
    /// Builds a <c>Set</c> containing the key of every TAG token found in
    /// <paramref name="textFragment"/>.
    /// </summary>
    /// <param name="hashSize">Not used by this method.</param>
    /// <param name="textFragment">Text to tokenize; when null an empty set is returned.</param>
    /// <param name="lex">Orthography that maps a token's text to its key.</param>
    /// <returns>The set of keys (empty when there is no text).</returns>
    static Set MakeStops(int hashSize, string textFragment, IOrthography lex)
    {
        var stops = new Set();

        // Nothing to tokenize: hand back the empty set.
        if (textFragment == null)
        {
            return stops;
        }

        foreach (var token in PlainText.ForEach(textFragment, 0, textFragment.Length, 0))
        {
            if (token.Type != PlainTextTag.TAG)
            {
                continue;
            }

            string raw = token.TextFragment.Substring(token.StartIndex, token.Length);
            stops.Push(lex.GetKey(raw));
        }

        return stops;
    }
Beispiel #5
0
    /// <summary>
    /// Trains <paramref name="Model"/> on the given files using
    /// <c>Environment.ProcessorCount * 2</c> worker threads and blocks until all of
    /// them have finished.
    /// </summary>
    /// <remarks>
    /// Each worker runs up to <c>GENS</c> passes. In every pass it takes a fresh,
    /// shuffled copy of the file list, reads each file, slides a window of
    /// <c>2 * CBOW.WINDOW + 1</c> keys over its tokens and calls
    /// <c>CBOW.learnWindow</c> for every token, reporting the returned loss through
    /// <paramref name="SetLoss"/>. All workers share <paramref name="Model"/>, the
    /// negative-sampling distribution and the <c>verbOut</c> counter passed by
    /// reference to <c>learnWindow</c>.
    /// </remarks>
    /// <param name="sourceFiles">Set that is enumerated as a sequence of file paths.</param>
    /// <param name="lex">Orthography that maps a token's text to its key.</param>
    /// <param name="Model">Model to train; when null a message is printed and nothing else happens.</param>
    /// <param name="SetLoss">Optional callback receiving each loss value; may be null.</param>
    /// <param name="HasCtrlBreak">Optional cancellation probe; may be null (never cancelled).</param>
    static void TrainMikolovModel(Set sourceFiles, IOrthography lex,
                                  Matrix Model, Action <double> SetLoss, Func <bool> HasCtrlBreak)
    {
        if (Model == null)
        {
            Console.WriteLine("Model not loaded.");
            return;
        }
        Vector[] negDistr = System.Ai.CBOW.CreateNegDistr(
            Model, SHUFFLE);
        Thread[] threads         = new Thread[Environment.ProcessorCount * 2];
        int      numberOfThreads = 0,
                 verbOut = 0;

        // Single place for the "optional callback" cancellation test.
        Func<bool> isCancelled = () => HasCtrlBreak != null && HasCtrlBreak();

        for (var t = 0; t < threads.Length; t++)
        {
            threads[t] = new Thread(() => {
                Interlocked.Increment(ref numberOfThreads);
                try {
                    for (int iter = 0; iter < GENS; iter++)
                    {
                        // Cancelled between passes: leave the loop; the "stopped"
                        // line below is still printed.
                        if (isCancelled())
                        {
                            break;
                        }
                        // Every pass works on its own shuffled copy of the file list.
                        string[] order = ((IEnumerable <string>)sourceFiles).ToArray();
                        Random.Shuffle(order, order.Length);
                        foreach (string file in order)
                        {
                            // Cancelled inside a pass: return immediately (the
                            // finally block still runs, the "stopped" line does not).
                            if (isCancelled())
                            {
                                return;
                            }
                            Console.Write($"\r\nReading {file}...\r\n");
                            var textFragment = File.ReadAllText(file);
                            string[] slidingWindow
                                = new string[2 * System.Ai.CBOW.WINDOW + 1];
                            int last = slidingWindow.Length - 1;
                            foreach (var q
                                     in PlainText.ForEach(textFragment, 0, textFragment.Length, 1 + (slidingWindow.Length >> 1)))
                            {
                                if (isCancelled())
                                {
                                    return;
                                }
                                // Non-TAG tokens enter the window as null.
                                var vocab = q.Type == PlainTextTag.TAG
                                    ? lex.GetKey(textFragment.Substring(
                                                     q.StartIndex,
                                                     q.Length))
                                    : null;
                                // Shift the window one slot to the left and append
                                // the newest key at the end.
                                for (int i = 0; i < last; i++)
                                {
                                    slidingWindow[i] = slidingWindow[i + 1];
                                }
                                slidingWindow[last] = vocab;

                                var loss = System.Ai.CBOW.learnWindow(Model,
                                                                      negDistr, slidingWindow,
                                                                      iter,
                                                                      HasCtrlBreak, ref verbOut);
                                // The loss callback is optional, like HasCtrlBreak;
                                // training proceeds without it.
                                SetLoss?.Invoke(loss);
                            }
                            // Pause after each file: 3000 ms plus a random extra.
                            Thread.Sleep(3000 + Random.Next(3000));
                        }
                    }
                } finally {
                    Interlocked.Decrement(ref numberOfThreads);
                }
                Console.Write($"[{Thread.CurrentThread.ManagedThreadId}] stopped...\r\n");
            });
        }
        foreach (var t in threads)
        {
            t.Start();
        }
        foreach (var t in threads)
        {
            t.Join();
        }
        // Every worker decrements in its finally block, so the count is back to zero.
        Debug.Assert(numberOfThreads == 0);
    }
Beispiel #6
0
        /// <summary>
        /// Combines the vectors of the words in <paramref name="Q"/> into one query
        /// vector, asks <c>CBOW.Predict</c> for up to <paramref name="max"/> results,
        /// sorts them with <c>Dot.CompareTo</c> and prints them to the console from
        /// the last sorted element to the first.
        /// </summary>
        /// <remarks>
        /// A "+" or "-" key in the query sets the sign applied to all following
        /// words until the next sign key. The summed vector is divided by the number
        /// of words that were found in the model. Words that are not in the model
        /// are reported on the console and otherwise ignored.
        /// </remarks>
        /// <param name="lex">Orthography that maps a token's text to its key.</param>
        /// <param name="Model">Model to query.</param>
        /// <param name="Q">Query text.</param>
        /// <param name="max">Value passed to <c>CBOW.Predict</c>.</param>
        /// <returns>
        /// The sorted result array, or null when <paramref name="Model"/> is null or
        /// <paramref name="Q"/> is null, empty or whitespace.
        /// </returns>
        public static Word[] RunFullCosineSort(IOrthography lex, Matrix <Word> Model, string Q, int max)
        {
            if (Model == null)
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("Model not loaded.\r\n");
                Console.ResetColor();
                Console.WriteLine("See '--load' command for more info...\r\n");
                return(null);
            }
            if (string.IsNullOrWhiteSpace(Q))
            {
                // A blank query is a different problem from a missing model; do not
                // tell the user to load a model that is already loaded.
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("Empty query.\r\n");
                Console.ResetColor();
                return(null);
            }
            float[] Re    = new float[CBOW.DIMS];
            int     found = 0;
            var     sign  = +1;

            foreach (var tok in PlainText.ForEach(Q, 0, Q.Length, 0))
            {
                string wi = lex.GetKey(tok.TextFragment.Substring(tok.StartIndex, tok.Length));
                if (wi == "+")
                {
                    sign = +1;
                }
                else if (wi == "-")
                {
                    sign = -1;
                }
                else
                {
                    var vec = Model[wi];
                    if (vec != null)
                    {
                        Debug.Assert(vec.Elements.Length == Re.Length);
                        for (var j = 0; j < Re.Length; j++)
                        {
                            Re[j] += sign * vec.Elements[j].Re;
                        }
                        found++;
                    }
                    else
                    {
                        Console.ForegroundColor = ConsoleColor.Yellow;
                        Console.WriteLine($"'{wi}' not found.");
                        Console.ResetColor();
                    }
                }
            }
            // Average over the words that contributed; skipped when none did, to
            // avoid dividing by zero.
            if (found > 0)
            {
                for (var j = 0; j < Re.Length; j++)
                {
                    Re[j] /= found;
                }
            }
            Word[] output = CBOW.Predict(Model, Re, max);
            Array.Sort(output,
                       (a, b) => Dot.CompareTo(a, b));
            Console.WriteLine();
            // Preview of the first seven components of the query vector.
            Console.WriteLine(" [" + string.Join(",", Re.Select(re => Math.Round(re, 4)).Take(7)) + "...]");
            Console.WriteLine();

            // Short result lists also show each word's details.
            bool verbose = output.Length <= 31;
            int  len     = 0;

            for (int i = output.Length - 1; i >= 0; i--)
            {
                Word n = output[i];
                if (n != null)
                {
                    string str  = n.Id;
                    string text = verbose
                        ? $" {str} : {n.ToString(z: true)}"
                        : $" {str}";
                    if (len + str.Length > 37 /* break line if it does not fit */)
                    {
                        Console.WriteLine(text);
                        len = 0;
                    }
                    else
                    {
                        Console.Write(text);
                        len += str.Length;
                    }
                }
            }
            Console.WriteLine();
            return(output);
        }