/// <summary>
/// Intended to accumulate per-word spelling-variation weights into <paramref name="Model"/>
/// from the given source files.
/// NOTE(review): the entire learning loop (and the zero-out pass) is commented out, so as
/// written this method only prints progress messages and performs no work — confirm whether
/// that is intentional or leftover from debugging.
/// </summary>
static void LearnSpellingVariations(Matrix Model, Set files, IOrthography lex) {
    Console.Write($"Learning spelling variations...\r\n");
    // Disabled: reset all variation scores before re-learning.
    // void ZeroOutVariations() {
    //     foreach (var it in Model) {
    //         it.Clear();
    //     }
    // }
    // ZeroOutVariations();
    foreach (string file in (IEnumerable <string>)files) {
        Console.Write($"Reading {file}...\r\n");
        // Disabled: tokenize the file and push each surface form onto its model entry,
        // weighting Capitalized-but-not-ALLCAPS forms twice as heavily.
        // string textFragment = File.ReadAllText(file);
        // foreach (var t in PlainText.ForEach(textFragment, 0, textFragment.Length, 0)) {
        //     if (t.Type == PlainTextTag.TAG) {
        //         var s = t.TextFragment.Substring(t.StartIndex, t.Length);
        //         var it = Model[lex.GetKey(s)];
        //         if (it != null) {
        //             it.Push(s,
        //                 out Scalar spellingVariation);
        //             if (s.Length > 1 && char.IsLetter(s[0])
        //                     && char.IsUpper(s[0]) && !char.IsUpper(s[1])) {
        //                 /* Give more weight to capitalized words... */
        //                 spellingVariation.Add(2d / CBOW.THRESHOLD);
        //             } else {
        //                 spellingVariation.Add(1d / CBOW.THRESHOLD);
        //             }
        //         }
        //     }
        // }
    }
    Console.Write($"\r\nReady!\r\n");
}
/// <summary>
/// Reads every file in <paramref name="files"/>, tokenizes its text, and bumps the model
/// entry of each token's key by 1 / CBOW.THRESHOLD, skipping keys found in the optional
/// <paramref name="skipList"/>.
/// </summary>
static void ParsePlainTextFiles(Matrix Model, Set files, IOrthography lex, Set skipList) {
    // A key is a stop word when it is present in the (optional) skip list.
    bool IsStopWord(string w) => skipList != null && skipList[w] != null;
    foreach (string file in (IEnumerable<string>)files) {
        Console.Write($"Reading {file}...\r\n");
        var text = File.ReadAllText(file);
        foreach (var token in PlainText.ForEach(text, 0, text.Length, 0)) {
            if (token.Type != PlainTextTag.TAG) {
                continue;
            }
            var id = lex.GetKey(token.TextFragment.Substring(token.StartIndex, token.Length));
            if (IsStopWord(id)) {
                continue;
            }
            var entry = Model.Push(id);
            entry.Add(1d / CBOW.THRESHOLD);
        }
    }
}
/// <summary>
/// Creates the sub-word model with the given capacity and dimensionality, and collects the
/// input files under <paramref name="dir"/> matching <paramref name="searchPattern"/>.
/// </summary>
public WithSubWords(int capacity, int dims, string dir, string searchPattern,
        SearchOption searchOption, IOrthography orthography) {
    Orthography = orthography;
    Model = new System.Ai.Model(capacity, dims);
    Files = Tools.GetFiles(dir, searchPattern, searchOption).ToArray();
}
/// <summary>
/// Creates an instance of the <see cref="Args"/> class with every setting fully specified.
/// </summary>
private Args(int capacity, int gens, int dims, IOrthography orthography,
        string searchPath, string searchPattern, SearchOption searchOption) {
    Orthography = orthography;
    SearchPath = searchPath;
    SearchPattern = searchPattern;
    SearchOption = searchOption;
    Capacity = capacity;
    Gens = gens;
    Dims = dims;
}
/// <summary>
/// Configures a CBOW trainer over the files found under the search path.
/// NOTE(review): the parameter name "seachPath" is misspelled ("searchPath"); it is kept
/// as-is because renaming a parameter breaks callers that use named arguments.
/// </summary>
public ContinuousBagOfWords(IModel model, string seachPath, string searchPattern,
        SearchOption searchOption, IOrthography orthography,
        float learningRate, int negatives, int window) {
    Model = model;
    Orthography = orthography;
    LearningRate = learningRate;
    Negatives = negatives;
    Window = window;
    Files = Tools.GetFiles(seachPath, searchPattern, searchOption).ToArray();
}
/// <summary>
/// Builds an allow-list matrix from the tokens of a single text file: every tokenized
/// word's key is pushed into a matrix of the given hash size.
/// </summary>
static Matrix MakeWhiteList(string file, IOrthography lex, int hashSize) {
    Console.Write($"\r\nReading {file}...\r\n\r\n");
    var whiteList = new Matrix(hashSize);
    var text = File.ReadAllText(file);
    foreach (var token in PlainText.ForEach(text, 0, text.Length, 0)) {
        if (token.Type != PlainTextTag.TAG) {
            continue;
        }
        var word = token.TextFragment.Substring(token.StartIndex, token.Length);
        whiteList.Push(lex.GetKey(word));
    }
    return whiteList;
}
/// <summary>
/// Tokenizes <paramref name="textFragment"/> and collects the key of every token into a
/// stop-word set. Returns an empty set when the fragment is null.
/// NOTE(review): <paramref name="hashSize"/> is never used — presumably it was meant to be
/// passed to the <c>Set</c> constructor; confirm and either wire it through or drop it.
/// </summary>
static Set MakeStops(int hashSize, string textFragment, IOrthography lex) {
    var S = new Set();
    if (textFragment != null) {
        foreach (var t in PlainText.ForEach(textFragment, 0, textFragment.Length, 0)) {
            if (t.Type == PlainTextTag.TAG) {
                S.Push(lex.GetKey(t.TextFragment.Substring(t.StartIndex, t.Length)));
            }
        }
    }
    return (S);
}
/// <summary>
/// Builds a word matrix from the plain-text files under <paramref name="sourcePath"/>,
/// honoring optional ".allow" (whitelist) and ".ignore" (stop-word) companion files that
/// sit next to <paramref name="outputFileName"/>.
/// </summary>
static Matrix BuildFromPlainText(string sourcePath, string searchPattern, IOrthography lex, string outputFileName) {
    var Model = new Matrix(SIZE);
    Set SourceFiles = null, SkipList = null;
    Matrix AllowList = null;
    var file = Path.ChangeExtension(outputFileName, ".allow");
    if (File.Exists(file)) {
        AllowList = MakeWhiteList(file, lex, SIZE * 3);
    }
    file = Path.ChangeExtension(outputFileName, ".ignore");
    if (File.Exists(file)) {
        SkipList = MakeStops(13452, File.ReadAllText(file), lex);
    }
    ParsePlainTextFiles(
        Model,
        SourceFiles = MakeFiles(new string[] { sourcePath }, searchPattern, SearchOption.AllDirectories),
        lex,
        SkipList);
    // Fix: AllowList stays null when no ".allow" file exists; the original dereferenced
    // AllowList.Count unconditionally and threw NullReferenceException. Use the same
    // null-conditional guard as the generic overload of this method.
    if (AllowList?.Count > 0 || CBOW.THRESHOLD > 0) {
        LimitToThreshold(AllowList, ref Model);
    }
    InitializeScores(Model);
    LearnSpellingVariations(
        Model,
        SourceFiles,
        lex);
    return (Model);
}
/// <summary>
/// Scans the documents found at <paramref name="paths"/> and accumulates distance-weighted
/// co-occurrence counts for every distinct word pair inside a symmetric window. Each pair is
/// stored under a canonically ordered "left right" key, and each hit adds 0.5 / |i - j| to
/// the pair's single-element vector.
/// </summary>
/// <exception cref="ArgumentOutOfRangeException">window is outside (0, 17].</exception>
/// <exception cref="OutOfMemoryException">the hash cannot accept another key.</exception>
public Hash CoOccurrences(Hash digrams, IOrthography lang, int window, params string[] paths) {
    if (window <= 0 || window > 17) {
        throw new ArgumentOutOfRangeException();
    }
    if (digrams == null) {
        digrams = Hash.Max();
    }
    Document.Scan(paths,
        read: (s, emit) => {
            // Emit only non-empty normalized tokens.
            string k = lang.Hash(s);
            if (k != null && k.Length > 0) {
                emit(k);
            }
        },
        doc: (file, doc) => {
            for (int i = 0; i < doc.Count; i++) {
                string w = doc[i];
                for (int j = i - ((window + 1) / 2); j < i + ((window + 1) / 2) + 1; j++) {
                    if (j >= 0 && j < doc.Count && i != j) {
                        string c = doc[j];
                        if (w != c) {
                            // Fix: order the pair in locals. The original swapped w and c
                            // in place ("w = c"), which overwrote the center word w and
                            // corrupted every later iteration of the inner window loop.
                            string left = w, right = c;
                            if (Gram.Compare(left, right) > 0) {
                                string t = left; left = right; right = t;
                            }
                            string k = (left + " " + right);
                            lock (digrams) {
                                float d = ((float)Math.Abs(i - j));
                                Gram g = digrams.Get(k);
                                if (g == null) {
                                    g = digrams.Put(k);
                                    if (g == null) {
                                        throw new OutOfMemoryException();
                                    }
                                    g.Vector = new float[] { 0f };
                                }
                                System.Diagnostics.Debug.Assert(g.Vector != null && g.Vector.Length == 1);
                                g.Vector[0] += 0.5f / d;
                            }
                        }
                    }
                }
            }
        });
    return (digrams);
}
/// <summary>
/// Trains the model word2vec-style on multiple threads: each thread repeatedly shuffles the
/// source files, tokenizes each file, slides a (2 * WINDOW + 1)-token window over the text,
/// and reports the loss of each learned window through <paramref name="SetLoss"/>.
/// Blocks until all worker threads have finished; <paramref name="HasCtrlBreak"/> (optional)
/// is polled to allow cooperative cancellation.
/// </summary>
static void TrainMikolovModel(Set sourceFiles, IOrthography lex, Matrix Model, Action <double> SetLoss, Func <bool> HasCtrlBreak) {
    if (Model == null) {
        Console.WriteLine("Model not loaded.");
        return;
    }
    Vector[] negDistr = System.Ai.CBOW.CreateNegDistr(
        Model, SHUFFLE);
    // Two workers per logical CPU; all of them train concurrently over the same model.
    Thread[] threads = new Thread[Environment.ProcessorCount * 2];
    int numberOfThreads = 0, verbOut = 0;
    for (var t = 0; t < threads.Length; t++) {
        threads[t] = new Thread(() => {
            Interlocked.Increment(ref numberOfThreads);
            try {
                for (int iter = 0; iter < GENS; iter++) {
                    if (HasCtrlBreak != null && HasCtrlBreak()) {
                        break;
                    }
                    // Each generation walks the files in a fresh random order.
                    string[] Shuffle = ((IEnumerable <string>)sourceFiles).ToArray();
                    Random.Shuffle(Shuffle, Shuffle.Length);
                    foreach (string file in Shuffle) {
                        if (HasCtrlBreak != null && HasCtrlBreak()) {
                            return;
                        }
                        try {
                            Console.Write($"\r\nReading {file}...\r\n");
                            var textFragment = File.ReadAllText(file);
                            string[] slidingWindow = new string[2 * System.Ai.CBOW.WINDOW + 1];
                            foreach (var q in PlainText.ForEach(textFragment, 0, textFragment.Length, 1 + (slidingWindow.Length >> 1))) {
                                if (HasCtrlBreak != null && HasCtrlBreak()) {
                                    return;
                                }
                                // Non-word tokens enter the window as null placeholders.
                                var vocab = q.Type == PlainTextTag.TAG ?
                                    lex.GetKey(textFragment.Substring(
                                        q.StartIndex, q.Length)) : null;
                                // Shift the window left by one and append the newest token at the end.
                                for (int i = 0; i < slidingWindow.Length; i++) {
                                    if (i == slidingWindow.Length - 1) {
                                        slidingWindow[i] = vocab;
                                    } else {
                                        slidingWindow[i] = slidingWindow[i + 1];
                                    }
                                }
                                SetLoss(System.Ai.CBOW.learnWindow(Model, negDistr, slidingWindow, iter, HasCtrlBreak, ref verbOut));
                            }
                            // Brief randomized pause between files to stagger the workers.
                            Thread.Sleep(3000 + Random.Next(3000));
                        } finally {
                        }
                    }
                }
            } finally {
                Interlocked.Decrement(ref numberOfThreads);
            }
            Console.Write($"[{Thread.CurrentThread.ManagedThreadId}] stopped...\r\n");
        });
    }
    foreach (var t in threads) {
        t.Start();
    }
    foreach (var t in threads) {
        t.Join();
    }
    Debug.Assert(numberOfThreads == 0);
}
/// <summary>
/// Builds a Word matrix from the plain-text files under <paramref name="sourcePath"/>,
/// honoring an optional ".ignore" (stop-word) companion file next to
/// <paramref name="outputFileName"/>.
/// NOTE(review): the ".allow" whitelist load is commented out, so White is always null here
/// and LimitToThreshold runs with a null whitelist whenever CBOW.THRESHOLD > 0 — confirm
/// whether disabling the whitelist was intentional.
/// </summary>
static Matrix <Word> BuildFromPlainText(string sourcePath, string searchPattern, IOrthography lex, string outputFileName) {
    var Model = new Matrix <Word>((id, hashCode) => new Word(id, hashCode), SIZE);
    Set SourceFiles = null, Black = null;
    var ignoreFile = Path.ChangeExtension(outputFileName, ".ignore");
    if (File.Exists(ignoreFile)) {
        Black = MakeBlackList(13452, File.ReadAllText(ignoreFile), lex);
    }
    ParsePlainTextFiles(
        Model,
        SourceFiles = MakeFileList(new string[] { sourcePath }, searchPattern, SearchOption.AllDirectories),
        lex,
        Black);
    Matrix <Word> White = null;
    var file = Path.ChangeExtension(outputFileName, ".allow");
    if (File.Exists(file)) {
        // Disabled: whitelist loading.
        // White = MakeWhiteList(file, lex, SIZE);
    }
    if (White?.Count > 0 || CBOW.THRESHOLD > 0) {
        LimitToThreshold(White, ref Model);
    }
    InitializeAndRandomize(Model);
    return (Model);
}
/// <summary>
/// Parses the query <paramref name="Q"/> as a signed word expression ("king - man + woman"),
/// averages the matching word vectors with the running sign, predicts up to
/// <paramref name="max"/> nearest words with CBOW, and prints them to the console in a
/// wrapped layout. Returns the predicted words sorted by dot product, or null when no model
/// is loaded or the query is blank.
/// </summary>
public static Word[] RunFullCosineSort(IOrthography lex, Matrix <Word> Model, string Q, int max) {
    if (Model == null || string.IsNullOrWhiteSpace(Q)) {
        Console.ForegroundColor = ConsoleColor.Yellow;
        Console.WriteLine("Model not loaded.\r\n");
        Console.ResetColor();
        Console.WriteLine("See '--load' command for more info...\r\n");
        return (null);
    }
    // Accumulate the signed sum of the query word vectors in Re.
    float[] Re = new float[CBOW.DIMS];
    float norm = 0;
    var sign = +1;
    foreach (var tok in PlainText.ForEach(Q, 0, Q.Length, 0)) {
        string wi = lex.GetKey(tok.TextFragment.Substring(tok.StartIndex, tok.Length));
        // "+" and "-" tokens flip the sign applied to the following words.
        if (wi == "+") {
            sign = +1;
        } else if (wi == "-") {
            sign = -1;
        } else {
            var vec = Model[wi];
            if (vec != null) {
                Debug.Assert(vec.Elements.Length == Re.Length);
                for (var j = 0; j < Re.Length; j++) {
                    Re[j] += sign * vec.Elements[j].Re;
                }
                norm++;
            } else {
                Console.ForegroundColor = ConsoleColor.Yellow;
                Console.WriteLine($"'{wi}' not found.");
                Console.ResetColor();
            }
        }
    }
    // Average over the number of words that were actually found.
    if (norm > 0) {
        for (var j = 0; j < Re.Length; j++) {
            Re[j] /= (float)norm;
        }
    }
    Word[] output = CBOW.Predict(Model, Re, max);
    Array.Sort(output, (a, b) => Dot.CompareTo(a, b));
    Console.WriteLine();
    // Show a short preview of the averaged query vector.
    Console.WriteLine(" [" + string.Join(",", Re.Select(re => Math.Round(re, 4)).Take(7)) + "...]");
    Console.WriteLine();
    int len = 0;
    // Print best matches last (array is ascending), wrapping lines at ~37 characters;
    // scores are shown only for small result sets (<= 31 entries).
    for (int i = output.Length - 1; i >= 0; i--) {
        Word n = output[i];
        if (n != null) {
            string str = n.Id;
            var it = Model[n.Id];
            if (it != null) {
                // Disabled: substitute the highest-scoring spelling variation for display.
                // if (it.Count > 0) {
                //     var best = it.ArgMax();
                //     if (best != null) {
                //         str = best.Id;
                //     }
                // }
            }
            if (len + str.Length > 37 /* break line if it does not fit */) {
                Console.WriteLine(
                    output.Length <= 31 ? $" {str} : {n.ToString(z: true)}" : $" {str}");
                len = 0;
            } else {
                Console.Write(
                    output.Length <= 31 ? $" {str} : {n.ToString(z: true)}" : $" {str}");
                len += str.Length;
            }
        }
    }
    Console.WriteLine();
    return (output);
}
/// <summary>
/// Creates an instance of the <see cref="Args"/> class with only the search settings
/// specified.
/// </summary>
public Args(IOrthography orthography, string searchPath, string searchPattern) {
    SearchPattern = searchPattern;
    SearchPath = searchPath;
    Orthography = orthography;
}