/// <summary>
/// Reads every file enumerated from <paramref name="files"/>, tokenizes its text and, for each
/// TAG token whose key is not present in <paramref name="skipList"/>, pushes the key into
/// <paramref name="Model"/> and adds <c>1d / CBOW.THRESHOLD</c> to the returned entry.
/// </summary>
/// <param name="Model">Matrix that receives the pushed keys.</param>
/// <param name="files">Set enumerated as a sequence of file paths.</param>
/// <param name="lex">Orthography used to turn raw token text into a key.</param>
/// <param name="skipList">Optional set of keys to ignore; <c>null</c> disables filtering.</param>
static void ParsePlainTextFiles(Matrix Model, Set files, IOrthography lex, Set skipList) {
    foreach (string path in (IEnumerable<string>)files) {
        Console.Write($"Reading {path}...\r\n");
        string content = File.ReadAllText(path);
        foreach (var token in PlainText.ForEach(content, 0, content.Length, 0)) {
            // Only TAG tokens contribute to the model.
            if (token.Type != PlainTextTag.TAG) {
                continue;
            }
            var key = lex.GetKey(token.TextFragment.Substring(token.StartIndex, token.Length));
            // A key counts as a stop word when a skip list exists and holds an entry for it.
            bool isStopWord = skipList != null && skipList[key] != null;
            if (isStopWord) {
                continue;
            }
            var entry = Model.Push(key);
            entry.Add(1d / CBOW.THRESHOLD);
        }
    }
}
/// <summary>
/// Clears <c>Model</c> and repopulates it from every file in <c>Files</c>: the text of each TEXT
/// token is mapped to a key through <c>Orthography</c>; non-empty keys are pushed into the model
/// and the returned entry is incremented.
/// </summary>
public void Build() {
    Model.Clear();
    foreach (var path in Files) {
        Console.WriteLine($"Reading {Tools.GetShortPath(path)}...");
        string text = File.ReadAllText(path);
        foreach (var token in PlainText.ForEach(text)) {
            if (token.Type != PlainTextTag.TEXT) {
                continue;
            }
            var key = Orthography.GetKey(text.Substring(token.StartIndex, token.Length));
            // Skip tokens for which the orthography yields no usable key.
            if (key == null || key.Length == 0) {
                continue;
            }
            var counter = Model.Push(key);
            counter++;
        }
    }
    Console.WriteLine("Done.");
}
/// <summary>
/// Builds a matrix from the TAG tokens of a single text file: the key of every TAG token is
/// pushed into a new <c>Matrix</c> constructed with <paramref name="hashSize"/>.
/// </summary>
/// <param name="file">Path of the text file to read.</param>
/// <param name="lex">Orthography used to turn raw token text into a key.</param>
/// <param name="hashSize">Value passed to the <c>Matrix</c> constructor.</param>
/// <returns>The matrix holding the pushed keys.</returns>
static Matrix MakeWhiteList(string file, IOrthography lex, int hashSize) {
    var whiteList = new Matrix(hashSize);
    Console.Write($"\r\nReading {file}...\r\n\r\n");
    string content = File.ReadAllText(file);
    foreach (var token in PlainText.ForEach(content, 0, content.Length, 0)) {
        if (token.Type != PlainTextTag.TAG) {
            continue;
        }
        var raw = token.TextFragment.Substring(token.StartIndex, token.Length);
        whiteList.Push(lex.GetKey(raw));
    }
    return whiteList;
}
/// <summary>
/// Builds a set from the TAG tokens found in <paramref name="textFragment"/>: the key of every
/// TAG token is pushed into a new <c>Set</c>.
/// </summary>
/// <param name="hashSize">Not used by this method.</param>
/// <param name="textFragment">Text to tokenize; when <c>null</c> nothing is pushed.</param>
/// <param name="lex">Orthography used to turn raw token text into a key.</param>
/// <returns>The newly created set (never <c>null</c>).</returns>
static Set MakeStops(int hashSize, string textFragment, IOrthography lex) {
    var stops = new Set();
    if (textFragment == null) {
        return stops;
    }
    foreach (var token in PlainText.ForEach(textFragment, 0, textFragment.Length, 0)) {
        if (token.Type != PlainTextTag.TAG) {
            continue;
        }
        var raw = token.TextFragment.Substring(token.StartIndex, token.Length);
        stops.Push(lex.GetKey(raw));
    }
    return stops;
}
/// <summary>
/// Trains <paramref name="Model"/> on the given source files using
/// <c>Environment.ProcessorCount * 2</c> worker threads. Each worker runs up to <c>GENS</c>
/// passes; in every pass it shuffles the file list, reads each file, slides a window of
/// <c>2 * CBOW.WINDOW + 1</c> keys over its tokens and hands every window to
/// <c>CBOW.learnWindow</c>, reporting the returned loss through <paramref name="SetLoss"/>.
/// The call blocks until all workers have finished.
/// </summary>
/// <param name="sourceFiles">Set enumerated as a sequence of file paths.</param>
/// <param name="lex">Orthography used to turn raw token text into a key.</param>
/// <param name="Model">Model to train; when <c>null</c> a message is printed and nothing runs.</param>
/// <param name="SetLoss">Optional callback receiving the loss of each window; may be <c>null</c>.</param>
/// <param name="HasCtrlBreak">Optional cancellation probe; <c>null</c> means "never cancelled".</param>
static void TrainMikolovModel(Set sourceFiles, IOrthography lex, Matrix Model, Action<double> SetLoss, Func<bool> HasCtrlBreak) {
    if (Model == null) {
        Console.WriteLine("Model not loaded.");
        return;
    }

    Vector[] negDistr = System.Ai.CBOW.CreateNegDistr(Model, SHUFFLE);
    Thread[] threads = new Thread[Environment.ProcessorCount * 2];
    // Both counters are shared by all workers; verbOut is passed by reference to learnWindow.
    int numberOfThreads = 0, verbOut = 0;

    // Single place for the "callback present and signalled" check used at every loop level.
    bool IsCancelled() => HasCtrlBreak != null && HasCtrlBreak();

    for (var t = 0; t < threads.Length; t++) {
        threads[t] = new Thread(() => {
            Interlocked.Increment(ref numberOfThreads);
            try {
                for (int iter = 0; iter < GENS; iter++) {
                    if (IsCancelled()) {
                        break;
                    }
                    // Every worker visits the files in its own random order on every pass.
                    string[] Shuffle = ((IEnumerable<string>)sourceFiles).ToArray();
                    Random.Shuffle(Shuffle, Shuffle.Length);
                    foreach (string file in Shuffle) {
                        // NOTE(review): the early returns below leave the worker without
                        // printing the "stopped..." line, unlike the break above.
                        if (IsCancelled()) {
                            return;
                        }
                        Console.Write($"\r\nReading {file}...\r\n");
                        var textFragment = File.ReadAllText(file);
                        string[] slidingWindow = new string[2 * System.Ai.CBOW.WINDOW + 1];
                        foreach (var q in PlainText.ForEach(textFragment, 0, textFragment.Length, 1 + (slidingWindow.Length >> 1))) {
                            if (IsCancelled()) {
                                return;
                            }
                            // Non-TAG tokens enter the window as null.
                            var vocab = q.Type == PlainTextTag.TAG
                                ? lex.GetKey(textFragment.Substring(q.StartIndex, q.Length))
                                : null;
                            // Shift the window one slot to the left and append the new key.
                            Array.Copy(slidingWindow, 1, slidingWindow, 0, slidingWindow.Length - 1);
                            slidingWindow[slidingWindow.Length - 1] = vocab;
                            var loss = System.Ai.CBOW.learnWindow(Model, negDistr, slidingWindow, iter, HasCtrlBreak, ref verbOut);
                            // The loss callback is optional, like HasCtrlBreak.
                            SetLoss?.Invoke(loss);
                        }
                        // Pause 3000 ms plus whatever Random.Next(3000) adds before the next file.
                        Thread.Sleep(3000 + Random.Next(3000));
                    }
                }
            } finally {
                Interlocked.Decrement(ref numberOfThreads);
            }
            Console.Write($"[{Thread.CurrentThread.ManagedThreadId}] stopped...\r\n");
        });
    }

    foreach (var t in threads) {
        t.Start();
    }
    foreach (var t in threads) {
        t.Join();
    }
    // Every worker decrements in its finally block, so the count must be back to zero here.
    Debug.Assert(numberOfThreads == 0);
}
/// <summary>
/// Builds a query vector from the words in <paramref name="Q"/>, asks <c>CBOW.Predict</c> for up
/// to <paramref name="max"/> candidates, sorts them with <c>Dot.CompareTo</c> and prints them to
/// the console from the last sorted element to the first.
/// </summary>
/// <remarks>
/// A "+" or "-" token in the query sets the sign applied to the words that follow it. Words
/// missing from the model are reported and skipped. The accumulated vector is divided by the
/// number of words found.
/// </remarks>
/// <param name="lex">Orthography used to turn raw token text into a key.</param>
/// <param name="Model">Model to query.</param>
/// <param name="Q">Query text.</param>
/// <param name="max">Value forwarded to <c>CBOW.Predict</c>.</param>
/// <returns>
/// The sorted result array, or <c>null</c> when <paramref name="Model"/> is <c>null</c> or
/// <paramref name="Q"/> is null/whitespace.
/// </returns>
public static Word[] RunFullCosineSort(IOrthography lex, Matrix<Word> Model, string Q, int max) {
    if (Model == null || string.IsNullOrWhiteSpace(Q)) {
        Console.ForegroundColor = ConsoleColor.Yellow;
        Console.WriteLine("Model not loaded.\r\n");
        Console.ResetColor();
        Console.WriteLine("See '--load' command for more info...\r\n");
        return(null);
    }

    float[] Re = new float[CBOW.DIMS];
    float norm = 0;
    var sign = +1;
    foreach (var tok in PlainText.ForEach(Q, 0, Q.Length, 0)) {
        string wi = lex.GetKey(tok.TextFragment.Substring(tok.StartIndex, tok.Length));
        if (wi == "+") {
            sign = +1;
        } else if (wi == "-") {
            sign = -1;
        } else {
            var vec = Model[wi];
            if (vec != null) {
                Debug.Assert(vec.Elements.Length == Re.Length);
                for (var j = 0; j < Re.Length; j++) {
                    Re[j] += sign * vec.Elements[j].Re;
                }
                norm++;
            } else {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine($"'{wi}' not found.");
                Console.ResetColor();
            }
        }
    }

    // Average over the number of words that were found in the model.
    if (norm > 0) {
        for (var j = 0; j < Re.Length; j++) {
            Re[j] /= norm;
        }
    }

    Word[] output = CBOW.Predict(Model, Re, max);
    Array.Sort(output, (a, b) => Dot.CompareTo(a, b));

    // Preview of the first seven components of the query vector.
    Console.WriteLine();
    Console.WriteLine(" [" + string.Join(",", Re.Select(re => Math.Round(re, 4)).Take(7)) + "...]");
    Console.WriteLine();

    // Short result lists also show each word's details; long ones print the word only.
    bool showDetails = output.Length <= 31;
    int len = 0;
    for (int i = output.Length - 1; i >= 0; i--) {
        Word n = output[i];
        if (n == null) {
            continue;
        }
        string str = n.Id;
        string cell = showDetails ? $" {str} : {n.ToString(z: true)}" : $" {str}";
        if (len + str.Length > 37 /* break the line if it does not fit */) {
            Console.WriteLine(cell);
            len = 0;
        } else {
            Console.Write(cell);
            len += str.Length;
        }
    }
    Console.WriteLine();
    return(output);
}