void Train(Hash model, Gram[] shuffle, int VECTOR) { void Prepare() { Random r = new Random(); for (int i = 0; i < shuffle.Length; i++) { string[] window = shuffle[i].Key.Split(); Gram w = model.Get(window[0]); if (w == null) { w = model.Put(window[0]); w.Vector = new float[VECTOR * 2]; for (var j = 0; j < w.Vector.Length; j++) { w.Vector[j] = (float)r.NextDouble() - 0.5f; } } w.Norm = 0; Gram c = model.Get(window[1]); if (c == null) { c = model.Put(window[1]); c.Vector = new float[VECTOR * 2]; for (var j = 0; j < w.Vector.Length; j++) { c.Vector[j] = (float)r.NextDouble() - 0.5f; } } c.Norm = 0; } } Prepare(); float sgd(Gram w, Gram c, float Pwc) { float dot(float[] Vw, float[] Vc) { System.Diagnostics.Debug.Assert(Vw.Length == 2 * VECTOR); System.Diagnostics.Debug.Assert(Vc.Length == 2 * VECTOR); var y = 0f; for (int k = 0; k < VECTOR; k++) { y += Vw[k] * Vc[k + VECTOR]; } return(y); } float f(float x) { float y; float Xmax = 100f; if (x < Xmax) { y = (float)Math.Pow(x / Xmax, 0.75); } else { y = 1; } return(y); } float J(float x) { return(dot(w.Vector, c.Vector) - (float)Math.Log((double)x)); } float ʝ = J(Pwc), ƒ = f(Pwc); if (float.IsNaN(ƒ)) { System.Diagnostics.Debugger.Break(); } const float α = 0.05f; for (int k = 0; k < VECTOR; k++) { const float μ = 0.09f; float δJw = ƒ * ʝ * c.Vector[k + VECTOR]; float δJc = ƒ * ʝ * w.Vector[k]; w.Vector[k] -= α * δJw; c.Vector[k + VECTOR] -= α * δJc; } return(0.5f * ƒ * (ʝ * ʝ)); } for (int iter = 0; iter < 113 * 113; iter++) { if (canceled) { Console.WriteLine($"Stopping... [{Thread.CurrentThread.ManagedThreadId}]"); break; } float E = 0f; int count = 0; Shuffle(shuffle); System.Threading.Tasks.Parallel.ForEach(shuffle, new System.Threading.Tasks.ParallelOptions() { }, (co, state) => { if (canceled) { Console.WriteLine($"Stopping... 
[{Thread.CurrentThread.ManagedThreadId}]"); state.Stop(); return; } string[] window = co.Key.Split(); (Gram w, Gram c) = (model.Get(window[0]), model.Get(window[1])); float e; E += e = sgd(w, c, co.Vector[0]); int n = Interlocked.Increment(ref count); E += e = sgd(c, w, co.Vector[0]); n = Interlocked.Increment(ref count); //if (n % 75703 == 0) { // Console.WriteLine($"{iter:n0} [{n:n0}] : {E / n} Vw('{w.Key}') * Vc('{c.Key}') = {co.Vector[0]} ~ {e}"); //} });
/// <summary>
/// Reads a text file of keys with optional float vectors into a new <see cref="Hash"/> table.
/// </summary>
/// <remarks>
/// The first line is a header of three whitespace-separated integers: a magic number
/// (must be 6053), the number of records that follow, and a third integer (<c>dim</c>)
/// which must parse but is not otherwise used by this method.
/// Every following line is a key, optionally followed by '⇾' and a '['…']' list of floats.
/// A record without a list, or with an empty list, yields a <c>null</c> vector.
/// </remarks>
/// <param name="file">Path of the file to read.</param>
/// <param name="take">
/// Optional filter invoked with each key and its vector (possibly <c>null</c>). When it returns
/// <c>false</c> the record is not stored, but it still counts towards the record total.
/// </param>
/// <returns>The populated table.</returns>
/// <exception cref="InvalidDataException">
/// The header or a record is malformed, or the number of records read differs from the header total.
/// </exception>
/// <exception cref="OutOfMemoryException">The table's <c>Put</c> returned <c>null</c> for a key.</exception>
public static Hash Load(string file, Func<string, float[], bool> take) {
    Hash table = Hash.Max();
    using (var stream = File.OpenText(file)) {
        // Header: magic, record count, dim.
        Line h = new Line(stream.ReadLine());
        int magic;
        if (!int.TryParse(h.ReadInt(), out magic) || magic != 6053) {
            throw new InvalidDataException();
        }
        h.SkipWhite();
        int total;
        if (!int.TryParse(h.ReadInt(), out total)) {
            throw new InvalidDataException();
        }
        h.SkipWhite();
        int dim; // validated as an integer only; the value itself is unused here
        if (!int.TryParse(h.ReadInt(), out dim)) {
            throw new InvalidDataException();
        }
        int read = 0;
        // Scratch buffer shared by all records; each record's vector is copied out at its exact size.
        float[] buff = new float[0];
        for (; ; ) {
            Line r = new Line(stream.ReadLine());
            if (r.IsEof) {
                // presumably a null/empty line reports IsEof — the total check below catches early stops
                break;
            }
            string key = r.ReadKey();
            if (key == null || key.Length == 0) {
                throw new InvalidDataException();
            }
            float[] vector = null;
            r.SkipWhite();
            if (!r.IsEof && r.Char == '⇾') {
                r.Skip();
                int j = 0;
                r.SkipWhite();
                if (r.IsEof || r.Char != '[') {
                    throw new InvalidDataException();
                }
                r.Skip();
                while (!r.IsEof && r.Char != ']') {
                    r.SkipWhite();
                    float n;
                    string f = r.ReadFloat();
                    // NOTE(review): parses with the current culture; if the writer uses a fixed
                    // format this should probably be CultureInfo.InvariantCulture — confirm against the writer.
                    if (!float.TryParse(f, out n)) {
                        throw new InvalidDataException();
                    }
                    if (j >= buff.Length) {
                        // Grow geometrically instead of one slot at a time to avoid quadratic copying.
                        Array.Resize(ref buff, Math.Max(16, buff.Length * 2));
                    }
                    buff[j++] = n;
                }
                if (r.IsEof || r.Char != ']') {
                    throw new InvalidDataException();
                }
                if (j > 0) {
                    vector = new float[j];
                    Array.Copy(buff, vector, j);
                }
            }
            if (take != null) {
                if (!take(key, vector)) {
                    key = null; // rejected by the filter: skip storing, still counted below
                }
            }
            if (key != null) {
                Gram g = table.Put(key);
                if (g == null) {
                    throw new OutOfMemoryException();
                }
                g.Vector = vector;
            }
            read++;
        }
        if (total != read) {
            throw new InvalidDataException();
        }
    }
    return(table);
}
/// <summary>
/// Scans the given documents and accumulates distance-weighted co-occurrence counts for word pairs.
/// </summary>
/// <remarks>
/// For each position <c>i</c>, every other position <c>j</c> within <c>(window + 1) / 2</c> tokens
/// on either side is visited. Pairs of identical tokens are ignored. The pair key is the two tokens
/// joined by a single space, ordered so that <c>Gram.Compare(first, second) &lt;= 0</c>. Each visit adds
/// <c>0.5 / |i - j|</c> to element 0 of the pair's one-element vector.
/// </remarks>
/// <param name="digrams">Table to accumulate into; a new one is created when <c>null</c>.</param>
/// <param name="lang">Maps each raw token to its key via <c>Hash</c>; null or empty results are dropped.</param>
/// <param name="window">Window size; must be in the range 1..17.</param>
/// <param name="paths">Paths handed to <c>Document.Scan</c>.</param>
/// <returns>The table that was accumulated into.</returns>
/// <exception cref="ArgumentOutOfRangeException"><paramref name="window"/> is outside 1..17.</exception>
/// <exception cref="OutOfMemoryException">The table's <c>Put</c> returned <c>null</c> for a new pair.</exception>
public Hash CoOccurrences(Hash digrams, IOrthography lang, int window, params string[] paths) {
    if (window <= 0 || window > 17) {
        throw new ArgumentOutOfRangeException(nameof(window));
    }
    if (digrams == null) {
        digrams = Hash.Max();
    }
    // Number of neighbours inspected on each side of the centre token.
    int reach = (window + 1) / 2;
    Document.Scan(paths,
        read: (s, emit) => {
            string k = lang.Hash(s);
            if (k != null && k.Length > 0) {
                emit(k);
            }
        },
        doc: (file, doc) => {
            for (int i = 0; i < doc.Count; i++) {
                string w = doc[i];
                for (int j = i - reach; j < i + reach + 1; j++) {
                    if (j < 0 || j >= doc.Count || i == j) {
                        continue;
                    }
                    string c = doc[j];
                    if (w == c) {
                        continue;
                    }
                    // Order the pair into separate locals. Swapping w and c in place would
                    // replace the centre word for the remaining neighbours of this window.
                    string first = w;
                    string second = c;
                    if (Gram.Compare(first, second) > 0) {
                        first = c;
                        second = w;
                    }
                    string k = (first + " " + second);
                    float d = ((float)Math.Abs(i - j));
                    // The table is updated under a lock on the table itself
                    // (presumably Document.Scan may run this callback concurrently — confirm).
                    lock (digrams) {
                        Gram g = digrams.Get(k);
                        if (g == null) {
                            g = digrams.Put(k);
                            if (g == null) {
                                throw new OutOfMemoryException();
                            }
                            g.Vector = new float[] { 0f };
                        }
                        System.Diagnostics.Debug.Assert(g.Vector != null && g.Vector.Length == 1);
                        g.Vector[0] += 0.5f / d;
                    }
                }
            }
        }
    );
    return(digrams);
}