示例#1
0
 /// <summary>
 /// Enumerates <paramref name="files"/> and writes progress messages to the console.
 /// The code that would actually learn spelling variations is currently disabled
 /// (kept below as comments), so <paramref name="Model"/> and <paramref name="lex"/>
 /// are not used by the active code.
 /// </summary>
 /// <param name="Model">Model referenced only by the disabled code.</param>
 /// <param name="files">Collection that is cast to a sequence of file names.</param>
 /// <param name="lex">Orthography referenced only by the disabled code.</param>
 static void LearnSpellingVariations(Matrix Model, Set files, IOrthography lex)
 {
     Console.Write("Learning spelling variations...\r\n");

     // Disabled: reset of previously learned variations.
     //
     // void ZeroOutVariations() {
     //     foreach (var it in Model) {
     //         it.Clear();
     //     }
     // }
     // ZeroOutVariations();

     IEnumerable<string> paths = (IEnumerable<string>)files;
     foreach (string path in paths)
     {
         Console.Write($"Reading {path}...\r\n");

         // Disabled: per-token learning pass.
         //
         // string textFragment = File.ReadAllText(path);
         // foreach (var t in PlainText.ForEach(textFragment, 0, textFragment.Length, 0)) {
         //     if (t.Type == PlainTextTag.TAG) {
         //         var s = t.TextFragment.Substring(t.StartIndex, t.Length);
         //         var it = Model[lex.GetKey(s)];
         //         if (it != null) {
         //             it.Push(s,
         //                 out Scalar spellingVariation);
         //             if (s.Length > 1 && char.IsLetter(s[0])
         //                     && char.IsUpper(s[0]) && !char.IsUpper(s[1])) {
         //                         /* Give more weight to capitalized words... */
         //                 spellingVariation.Add(2d / CBOW.THRESHOLD);
         //             } else {
         //                 spellingVariation.Add(1d / CBOW.THRESHOLD);
         //             }
         //         }
         //     }
         // }
     }

     Console.Write("\r\nReady!\r\n");
 }
示例#2
0
    /// <summary>
    /// Reads every file in <paramref name="files"/>, tokenizes its text and, for each
    /// TAG token whose key is not on the skip list, pushes the key into
    /// <paramref name="Model"/> and calls <c>Add(1d / CBOW.THRESHOLD)</c> on the
    /// entry returned by the push.
    /// </summary>
    /// <param name="Model">Model that receives the keys.</param>
    /// <param name="files">Collection that is cast to a sequence of file names.</param>
    /// <param name="lex">Orthography used to turn a token's text into a key.</param>
    /// <param name="skipList">Keys to ignore; may be null, in which case nothing is skipped.</param>
    static void ParsePlainTextFiles(Matrix Model, Set files, IOrthography lex, Set skipList)
    {
        // Without a skip list no word is treated as a stop word.
        bool IsStopWord(string w) => skipList != null && skipList[w] != null;

        foreach (string path in (IEnumerable<string>)files)
        {
            Console.Write($"Reading {path}...\r\n");
            string content = File.ReadAllText(path);

            foreach (var token in PlainText.ForEach(content, 0, content.Length, 0))
            {
                if (token.Type != PlainTextTag.TAG)
                {
                    continue;
                }

                var key = lex.GetKey(token.TextFragment.Substring(token.StartIndex, token.Length));
                if (IsStopWord(key))
                {
                    continue;
                }

                var entry = Model.Push(key);
                entry.Add(1d / CBOW.THRESHOLD);
            }
        }
    }
示例#3
0
文件: WithSubWords.cs 项目: azret/ml
 /// <summary>
 /// Initializes a new instance: creates a <c>System.Ai.Model</c> with the given
 /// capacity and dimensions, stores the orthography, and stores the array of files
 /// returned by <c>Tools.GetFiles</c> for the given directory, pattern and option.
 /// </summary>
 /// <param name="capacity">Capacity passed to the model constructor.</param>
 /// <param name="dims">Dimension count passed to the model constructor.</param>
 /// <param name="dir">Directory passed to <c>Tools.GetFiles</c>.</param>
 /// <param name="searchPattern">Search pattern passed to <c>Tools.GetFiles</c>.</param>
 /// <param name="searchOption">Search option passed to <c>Tools.GetFiles</c>.</param>
 /// <param name="orthography">Orthography stored on the instance.</param>
 public WithSubWords(
     int capacity,
     int dims,
     string dir,
     string searchPattern,
     SearchOption searchOption,
     IOrthography orthography)
 {
     Model = new System.Ai.Model(capacity, dims);
     Orthography = orthography;

     var matches = Tools.GetFiles(dir, searchPattern, searchOption);
     Files = matches.ToArray();
 }
示例#4
0
文件: Args.cs 项目: azret/ml
 /// <summary>
 /// Initializes a new <see cref="Args"/> instance with every setting supplied explicitly.
 /// </summary>
 /// <param name="capacity">Value stored in <c>Capacity</c>.</param>
 /// <param name="gens">Value stored in <c>Gens</c>.</param>
 /// <param name="dims">Value stored in <c>Dims</c>.</param>
 /// <param name="orthography">Value stored in <c>Orthography</c>.</param>
 /// <param name="searchPath">Value stored in <c>SearchPath</c>.</param>
 /// <param name="searchPattern">Value stored in <c>SearchPattern</c>.</param>
 /// <param name="searchOption">Value stored in <c>SearchOption</c>.</param>
 private Args(
     int capacity,
     int gens,
     int dims,
     IOrthography orthography,
     string searchPath,
     string searchPattern,
     SearchOption searchOption)
 {
     // Numeric settings.
     Capacity = capacity;
     Gens = gens;
     Dims = dims;

     // Text handling.
     Orthography = orthography;

     // File discovery settings.
     SearchPath = searchPath;
     SearchPattern = searchPattern;
     SearchOption = searchOption;
 }
示例#5
0
文件: Mikolov.cs 项目: azret/ml
 /// <summary>
 /// Initializes a new instance: stores the learning rate, negative-sample count,
 /// window size, model and orthography, and stores the array of files returned by
 /// <c>Tools.GetFiles</c> for the given path, pattern and option.
 /// </summary>
 /// <param name="model">Value stored in <c>Model</c>.</param>
 /// <param name="seachPath">Path passed to <c>Tools.GetFiles</c> (parameter name kept as-is for callers).</param>
 /// <param name="searchPattern">Search pattern passed to <c>Tools.GetFiles</c>.</param>
 /// <param name="searchOption">Search option passed to <c>Tools.GetFiles</c>.</param>
 /// <param name="orthography">Value stored in <c>Orthography</c>.</param>
 /// <param name="learningRate">Value stored in <c>LearningRate</c>.</param>
 /// <param name="negatives">Value stored in <c>Negatives</c>.</param>
 /// <param name="window">Value stored in <c>Window</c>.</param>
 public ContinuousBagOfWords(
     IModel model,
     string seachPath,
     string searchPattern,
     SearchOption searchOption,
     IOrthography orthography,
     float learningRate,
     int negatives,
     int window)
 {
     // Hyper-parameters.
     LearningRate = learningRate;
     Negatives = negatives;
     Window = window;

     // Collaborators.
     Model = model;
     Orthography = orthography;

     // Input files.
     var matches = Tools.GetFiles(seachPath, searchPattern, searchOption);
     Files = matches.ToArray();
 }
示例#6
0
    /// <summary>
    /// Reads <paramref name="file"/>, tokenizes its text and pushes the key of every
    /// TAG token into a new <c>Matrix</c> created with <paramref name="hashSize"/>.
    /// </summary>
    /// <param name="file">Path of the text file to read.</param>
    /// <param name="lex">Orthography used to turn a token's text into a key.</param>
    /// <param name="hashSize">Argument passed to the <c>Matrix</c> constructor.</param>
    /// <returns>The matrix holding the pushed keys.</returns>
    static Matrix MakeWhiteList(string file, IOrthography lex, int hashSize)
    {
        Matrix allowed = new Matrix(hashSize);

        Console.Write($"\r\nReading {file}...\r\n\r\n");
        string content = File.ReadAllText(file);

        foreach (var token in PlainText.ForEach(content, 0, content.Length, 0))
        {
            if (token.Type != PlainTextTag.TAG)
            {
                continue;
            }

            string word = token.TextFragment.Substring(token.StartIndex, token.Length);
            allowed.Push(lex.GetKey(word));
        }

        return allowed;
    }
示例#7
0
    /// <summary>
    /// Builds a <c>Set</c> containing the key of every TAG token found in
    /// <paramref name="textFragment"/>.
    /// </summary>
    /// <param name="hashSize">Not used by this method.</param>
    /// <param name="textFragment">Text to tokenize; when null an empty set is returned.</param>
    /// <param name="lex">Orthography used to turn a token's text into a key.</param>
    /// <returns>The set holding the pushed keys.</returns>
    static Set MakeStops(int hashSize, string textFragment, IOrthography lex)
    {
        Set stops = new Set();

        // Nothing to tokenize: hand back the empty set.
        if (textFragment == null)
        {
            return stops;
        }

        foreach (var token in PlainText.ForEach(textFragment, 0, textFragment.Length, 0))
        {
            if (token.Type != PlainTextTag.TAG)
            {
                continue;
            }

            string word = token.TextFragment.Substring(token.StartIndex, token.Length);
            stops.Push(lex.GetKey(word));
        }

        return stops;
    }
示例#8
0
    /// <summary>
    /// Builds a new model from the plain-text files found under
    /// <paramref name="sourcePath"/> (searched recursively).
    /// </summary>
    /// <remarks>
    /// Two optional side-car files are derived from <paramref name="outputFileName"/>
    /// by changing its extension: an <c>.allow</c> file is loaded into an allow list
    /// and an <c>.ignore</c> file into a skip list. Either may be absent.
    /// </remarks>
    /// <param name="sourcePath">Root path passed to <c>MakeFiles</c>.</param>
    /// <param name="searchPattern">Search pattern passed to <c>MakeFiles</c>.</param>
    /// <param name="lex">Orthography used to turn token text into keys.</param>
    /// <param name="outputFileName">File name from which the side-car file names are derived.</param>
    /// <returns>The populated model.</returns>
    static Matrix BuildFromPlainText(string sourcePath, string searchPattern, IOrthography lex, string outputFileName)
    {
        var Model = new Matrix(SIZE);

        Set SourceFiles = null,
            SkipList    = null;

        Matrix AllowList = null;

        var file = Path.ChangeExtension(outputFileName, ".allow");

        if (File.Exists(file))
        {
            AllowList = MakeWhiteList(file, lex, SIZE * 3);
        }

        file = Path.ChangeExtension(outputFileName, ".ignore");
        if (File.Exists(file))
        {
            SkipList = MakeStops(13452, File.ReadAllText(file), lex);
        }

        SourceFiles = MakeFiles(new string[] { sourcePath },
                                searchPattern, SearchOption.AllDirectories);

        ParsePlainTextFiles(
            Model,
            SourceFiles,
            lex,
            SkipList);

        // AllowList is still null when no ".allow" file exists, so its count must be
        // read null-safely; the previous unconditional AllowList.Count threw a
        // NullReferenceException in that case.
        // NOTE(review): LimitToThreshold can now receive a null allow list when only
        // the threshold applies - confirm it tolerates null.
        if (AllowList?.Count > 0 || CBOW.THRESHOLD > 0)
        {
            LimitToThreshold(AllowList, ref Model);
        }

        InitializeScores(Model);

        LearnSpellingVariations(
            Model,
            SourceFiles,
            lex);

        return Model;
    }
示例#9
0
 /// <summary>
 /// Scans the documents at <paramref name="paths"/> and accumulates weighted
 /// co-occurrence counts for pairs of distinct words that appear within a window
 /// of each other.
 /// </summary>
 /// <remarks>
 /// Each pair is stored under the key <c>"a b"</c>, where the two words are ordered
 /// with <c>Gram.Compare</c>. Every sighting adds <c>0.5 / distance</c> to the single
 /// element of the pair's vector, where distance is the absolute difference of the
 /// two word positions.
 /// </remarks>
 /// <param name="digrams">Table to accumulate into; when null, <c>Hash.Max()</c> is used.</param>
 /// <param name="lang">Orthography whose <c>Hash</c> maps raw text to a word key; empty or null keys are dropped.</param>
 /// <param name="window">Window size; must be in the range 1..17.</param>
 /// <param name="paths">Paths passed to <c>Document.Scan</c>.</param>
 /// <returns>The table that was accumulated into.</returns>
 /// <exception cref="ArgumentOutOfRangeException"><paramref name="window"/> is outside 1..17.</exception>
 /// <exception cref="OutOfMemoryException">The table could not store a new pair.</exception>
 public Hash CoOccurrences(Hash digrams, IOrthography lang, int window, params string[] paths)
 {
     if (window <= 0 || window > 17)
     {
         throw new ArgumentOutOfRangeException(nameof(window));
     }
     if (digrams == null)
     {
         digrams = Hash.Max();
     }

     // Number of positions examined on each side of the centre word.
     int reach = (window + 1) / 2;

     Document.Scan(paths,
                   read: (s, emit) =>
     {
         string k = lang.Hash(s);
         if (k != null && k.Length > 0)
         {
             emit(k);
         }
     },
                   doc: (file, doc) =>
     {
         for (int i = 0; i < doc.Count; i++)
         {
             string w = doc[i];
             for (int j = i - reach; j < i + reach + 1; j++)
             {
                 if (j < 0 || j >= doc.Count || i == j)
                 {
                     continue;
                 }
                 string c = doc[j];
                 if (w == c)
                 {
                     continue;
                 }

                 // Order the pair in fresh locals. The centre word `w` must stay
                 // intact for the remaining neighbours of this window; swapping it
                 // in place paired later neighbours with the wrong word.
                 string first = w;
                 string second = c;
                 if (Gram.Compare(first, second) > 0)
                 {
                     first = c;
                     second = w;
                 }
                 string k = (first + " " + second);

                 lock (digrams)
                 {
                     // i != j here, so the distance is never zero.
                     float d = ((float)Math.Abs(i - j));
                     Gram g = digrams.Get(k);
                     if (g == null)
                     {
                         g = digrams.Put(k);
                         if (g == null)
                         {
                             throw new OutOfMemoryException();
                         }
                         g.Vector = new float[] {
                             0f
                         };
                     }
                     System.Diagnostics.Debug.Assert(g.Vector != null && g.Vector.Length == 1);
                     g.Vector[0] += 0.5f / d;
                 }
             }
         }
     }
                   );
     return digrams;
 }
示例#10
0
文件: Train.cs 项目: azret/mozart
    /// <summary>
    /// Trains <paramref name="Model"/> on the given files using
    /// <c>Environment.ProcessorCount * 2</c> threads and blocks until all of them finish.
    /// </summary>
    /// <remarks>
    /// Each thread runs up to <c>GENS</c> passes. On every pass it shuffles the file
    /// list, reads each file, slides a window of <c>2 * CBOW.WINDOW + 1</c> keys over
    /// its tokens (non-TAG tokens enter the window as null) and reports the value
    /// returned by <c>CBOW.learnWindow</c> through <paramref name="SetLoss"/>. After
    /// each file the thread sleeps for 3000 ms plus <c>Random.Next(3000)</c>.
    /// </remarks>
    /// <param name="sourceFiles">Collection that is cast to a sequence of file names.</param>
    /// <param name="lex">Orthography used to turn token text into keys.</param>
    /// <param name="Model">Model to train; when null a message is printed and nothing else happens.</param>
    /// <param name="SetLoss">Callback that receives each value returned by <c>learnWindow</c>.</param>
    /// <param name="HasCtrlBreak">Optional cancellation probe; may be null.</param>
    static void TrainMikolovModel(Set sourceFiles, IOrthography lex,
                                  Matrix Model, Action<double> SetLoss, Func<bool> HasCtrlBreak)
    {
        if (Model == null)
        {
            Console.WriteLine("Model not loaded.");
            return;
        }

        Vector[] negDistr = System.Ai.CBOW.CreateNegDistr(Model, SHUFFLE);
        Thread[] workers = new Thread[Environment.ProcessorCount * 2];
        int numberOfThreads = 0;
        int verbOut = 0;

        // True when a cancellation probe exists and reports a break request.
        bool StopRequested() => HasCtrlBreak != null && HasCtrlBreak();

        // Body executed by every training thread.
        void Work()
        {
            Interlocked.Increment(ref numberOfThreads);
            try
            {
                for (int iter = 0; iter < GENS; iter++)
                {
                    if (StopRequested())
                    {
                        // Leaves the loop, so the "stopped" message below is still printed.
                        break;
                    }

                    string[] order = ((IEnumerable<string>)sourceFiles).ToArray();
                    Random.Shuffle(order, order.Length);

                    foreach (string file in order)
                    {
                        if (StopRequested())
                        {
                            return;
                        }

                        Console.Write($"\r\nReading {file}...\r\n");
                        var text = File.ReadAllText(file);
                        string[] slidingWindow = new string[2 * System.Ai.CBOW.WINDOW + 1];
                        int newest = slidingWindow.Length - 1;

                        foreach (var q in PlainText.ForEach(text, 0, text.Length, 1 + (slidingWindow.Length >> 1)))
                        {
                            if (StopRequested())
                            {
                                return;
                            }

                            var vocab = q.Type == PlainTextTag.TAG
                                ? lex.GetKey(text.Substring(q.StartIndex, q.Length))
                                : null;

                            // Shift the window left by one slot and append the new key.
                            for (int i = 0; i < newest; i++)
                            {
                                slidingWindow[i] = slidingWindow[i + 1];
                            }
                            slidingWindow[newest] = vocab;

                            SetLoss(System.Ai.CBOW.learnWindow(Model, negDistr, slidingWindow,
                                                               iter, HasCtrlBreak, ref verbOut));
                        }

                        Thread.Sleep(3000 + Random.Next(3000));
                    }
                }
            }
            finally
            {
                Interlocked.Decrement(ref numberOfThreads);
            }
            Console.Write($"[{Thread.CurrentThread.ManagedThreadId}] stopped...\r\n");
        }

        // Create all threads first, then start them, then wait for them.
        for (int t = 0; t < workers.Length; t++)
        {
            workers[t] = new Thread(Work);
        }
        foreach (Thread worker in workers)
        {
            worker.Start();
        }
        foreach (Thread worker in workers)
        {
            worker.Join();
        }

        Debug.Assert(numberOfThreads == 0);
    }
示例#11
0
文件: Mikolov.cs 项目: azret/mozart
        /// <summary>
        /// Builds a new word model from the plain-text files found under
        /// <paramref name="sourcePath"/> (searched recursively).
        /// </summary>
        /// <remarks>
        /// An optional <c>.ignore</c> file, named after <paramref name="outputFileName"/>,
        /// supplies keys to skip while parsing. Loading of the <c>.allow</c> file is
        /// currently disabled, so the allow list passed on is always null.
        /// </remarks>
        /// <param name="sourcePath">Root path passed to <c>MakeFileList</c>.</param>
        /// <param name="searchPattern">Search pattern passed to <c>MakeFileList</c>.</param>
        /// <param name="lex">Orthography used to turn token text into keys.</param>
        /// <param name="outputFileName">File name from which the side-car file names are derived.</param>
        /// <returns>The populated model.</returns>
        static Matrix<Word> BuildFromPlainText(string sourcePath, string searchPattern, IOrthography lex, string outputFileName)
        {
            var Model = new Matrix<Word>((id, hashCode) => new Word(id, hashCode), SIZE);

            // Optional skip list.
            Set ignored = null;
            string ignorePath = Path.ChangeExtension(outputFileName, ".ignore");
            if (File.Exists(ignorePath))
            {
                ignored = MakeBlackList(13452, File.ReadAllText(ignorePath), lex);
            }

            Set sources = MakeFileList(new string[] { sourcePath },
                                       searchPattern, SearchOption.AllDirectories);
            ParsePlainTextFiles(Model, sources, lex, ignored);

            // Optional allow list (loading is disabled, so this stays null).
            Matrix<Word> allowed = null;
            string allowPath = Path.ChangeExtension(outputFileName, ".allow");
            if (File.Exists(allowPath))
            {
                // allowed = MakeWhiteList(allowPath, lex, SIZE);
            }

            if (allowed?.Count > 0 || CBOW.THRESHOLD > 0)
            {
                LimitToThreshold(allowed, ref Model);
            }

            InitializeAndRandomize(Model);

            return Model;
        }
示例#12
0
文件: Mikolov.cs 项目: azret/mozart
        /// <summary>
        /// Combines the vectors of the words in <paramref name="Q"/> into one query
        /// vector, asks <c>CBOW.Predict</c> for up to <paramref name="max"/> candidates,
        /// sorts them with <c>Dot.CompareTo</c> and prints them to the console.
        /// </summary>
        /// <remarks>
        /// A <c>+</c> or <c>-</c> token sets the sign applied to the words that follow.
        /// Words missing from the model are reported and skipped. The summed vector is
        /// divided by the number of words that were found.
        /// </remarks>
        /// <param name="lex">Orthography used to turn token text into keys.</param>
        /// <param name="Model">Model to query.</param>
        /// <param name="Q">Query text.</param>
        /// <param name="max">Limit passed to <c>CBOW.Predict</c>.</param>
        /// <returns>The sorted candidates, or null when the model is null or the query is blank.</returns>
        public static Word[] RunFullCosineSort(IOrthography lex, Matrix<Word> Model, string Q, int max)
        {
            if (Model == null || string.IsNullOrWhiteSpace(Q))
            {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine("Model not loaded.\r\n");
                Console.ResetColor();
                Console.WriteLine("See '--load' command for more info...\r\n");
                return null;
            }

            float[] Re = new float[CBOW.DIMS];
            float found = 0;
            int sign = +1;

            foreach (var tok in PlainText.ForEach(Q, 0, Q.Length, 0))
            {
                string wi = lex.GetKey(tok.TextFragment.Substring(tok.StartIndex, tok.Length));

                // Operator tokens only change the sign for the words that follow.
                switch (wi)
                {
                    case "+":
                        sign = +1;
                        continue;
                    case "-":
                        sign = -1;
                        continue;
                }

                var vec = Model[wi];
                if (vec == null)
                {
                    Console.ForegroundColor = ConsoleColor.Yellow;
                    Console.WriteLine($"'{wi}' not found.");
                    Console.ResetColor();
                    continue;
                }

                Debug.Assert(vec.Elements.Length == Re.Length);
                for (var j = 0; j < Re.Length; j++)
                {
                    Re[j] += sign * vec.Elements[j].Re;
                }
                found++;
            }

            // Average over the words that contributed.
            if (found > 0)
            {
                for (var j = 0; j < Re.Length; j++)
                {
                    Re[j] /= found;
                }
            }

            Word[] output = CBOW.Predict(Model, Re, max);
            Array.Sort(output, (a, b) => Dot.CompareTo(a, b));

            // Preview of the first seven components of the query vector.
            var preview = Re.Take(7).Select(re => Math.Round(re, 4));
            Console.WriteLine();
            Console.WriteLine(" [" + string.Join(",", preview) + "...]");
            Console.WriteLine();

            // Details are printed only for short result lists.
            bool detailed = output.Length <= 31;
            int len = 0;

            // Walk the sorted array from the end to the beginning.
            for (int i = output.Length - 1; i >= 0; i--)
            {
                Word n = output[i];
                if (n == null)
                {
                    continue;
                }

                string str = n.Id;
                var it = Model[n.Id];
                if (it != null)
                {
                    // Disabled: replace the id with the best spelling variation.
                    //
                    // if (it.Count > 0) {
                    //     var best = it.ArgMax();
                    //     if (best != null) {
                    //         str = best.Id;
                    //     }
                    // }
                }

                // Break the line if the next word does not fit.
                bool wraps = len + str.Length > 37;
                string cell = detailed
                    ? $" {str} : {n.ToString(z: true)}"
                    : $" {str}";

                if (wraps)
                {
                    Console.WriteLine(cell);
                    len = 0;
                }
                else
                {
                    Console.Write(cell);
                    len += str.Length;
                }
            }

            Console.WriteLine();
            return output;
        }
示例#13
0
文件: Args.cs 项目: azret/ml
 /// <summary>
 /// Initializes a new <see cref="Args"/> instance from an orthography and the
 /// file search settings; no other members are assigned by this constructor.
 /// </summary>
 /// <param name="orthography">Value stored in <c>Orthography</c>.</param>
 /// <param name="searchPath">Value stored in <c>SearchPath</c>.</param>
 /// <param name="searchPattern">Value stored in <c>SearchPattern</c>.</param>
 public Args(
     IOrthography orthography,
     string searchPath,
     string searchPattern)
 {
     // Text handling.
     Orthography = orthography;

     // File discovery settings.
     SearchPath = searchPath;
     SearchPattern = searchPattern;
 }