/// <summary>
/// Splits <paramref name="s"/> into consecutive words using dynamic programming: every substring of
/// up to <paramref name="maxWordLength"/> characters is scored with the formula selected by
/// <paramref name="probType"/>, and the segmentation with the highest total score is returned.
/// </summary>
/// <param name="s">Text to segment.</param>
/// <param name="maxWordLength">Longest candidate word length, in characters.</param>
/// <param name="fpl">
/// Per-length statistics, indexed by candidate word length (<c>mean</c>, <c>stderr</c>, <c>sqrtSum</c>
/// and <c>logSum</c> are read). Not accessed for the Entropy, Entropy_L and Hank modes, so it may be
/// null for those.
/// </param>
/// <param name="probType">Scoring formula applied to each candidate word.</param>
/// <param name="verbose">When true, progress messages are written through <see cref="Trace"/>.</param>
/// <returns>
/// The words of the best segmentation, in left-to-right order. The same list is stored in
/// <c>SplittedWords</c>, <c>IsBuilt</c> is set to true, and <c>WordIdentified</c> is raised per word.
/// </returns>
/// <exception cref="InvalidOperationException">
/// No segmentation can be traced back from the end of <paramref name="s"/> (a position was never
/// reached by any candidate word).
/// </exception>
public List<string> Split(string s, int maxWordLength, List<FPLtype> fpl, ProbTypeEnum probType, bool verbose)
{
    // Scores the candidate word s[start .. start + length) with the formula selected by probType.
    // fpl is only indexed inside the branches that need it, so it may be null for the other modes.
    double Score(int start, int length)
    {
        string word = s.Substring(start, length);
        // Count is evaluated for every mode, as before; it is computed once and reused by the
        // branches that previously re-counted the same substring (assumes Count is deterministic).
        double wordCount = Count(word);

        // Disabled smoothing experiment, kept for reference:
        //if (wordCount <= 5)//fpl[l].percent10)
        //{
        //    wordCount = Math.Max(wordCount - 5, Math.Pow(((fpl[1].sqrtSum / (motherSA.S.Length / fpl[1].mean))) / motherSA.S.Length, l));
        //}

        switch (probType)
        {
            case ProbTypeEnum.CdL:
                return Math.Log(wordCount / (BaseDataLength() - length + 1));
            case ProbTypeEnum.CdM:
                return Math.Log(wordCount / fpl[length].mean);
            case ProbTypeEnum.CxLdM:
                return Math.Log(wordCount / fpl[length].mean) * length;
            case ProbTypeEnum.CmMdSTDE:
                return (wordCount - fpl[length].mean) / Math.Pow(fpl[length].stderr, 1.0);
            case ProbTypeEnum.sqCdS:
                return Math.Log(Math.Sqrt(wordCount) / (fpl[length].sqrtSum / (BaseDataLength() / fpl[length].mean)));
            case ProbTypeEnum.sqCxLdS:
                return Math.Log(Math.Sqrt(wordCount) / (fpl[length].sqrtSum / (BaseDataLength() / fpl[length].mean))) * length;
            case ProbTypeEnum.lnCdS:
                return Math.Log(Math.Max(double.Epsilon, Math.Log(wordCount) / (fpl[length].logSum / (BaseDataLength() / fpl[length].mean))));
            case ProbTypeEnum.lnCxLdS:
                return Math.Log(Math.Max(double.Epsilon, Math.Log(wordCount) / (fpl[length].logSum / (BaseDataLength() / fpl[length].mean)))) * length;
            case ProbTypeEnum.Sigmoid:
                return 1.0 / (1.0 + Math.Exp(-(wordCount - fpl[length].mean) / fpl[length].stderr)) * length;
            case ProbTypeEnum.Entropy:
                return Entropy(word);
            case ProbTypeEnum.Entropy_L:
                return Entropy(word) * length;
            case ProbTypeEnum.Hank:
                // Clamp to avoid Log(0).
                return Math.Log(Math.Max(CalHank(word), 1e-300));
            default:
                throw new Exception($"Unknown probType: {probType}");
        }
    }

    try
    {
        if (verbose) { Trace.Indent(); }
        if (verbose) { Trace.WriteLine("Getting FPL..."); }
        if (verbose) { Trace.Write("OK"); }

        int n = s.Length;
        List<string> ans = new List<string>();

        // dp[i]  : best total score of a segmentation of s[0 .. i)
        // pre[i] : length of the last word in that segmentation (0 = position never reached)
        // cnt[i] : number of words in that segmentation
        int[] pre = new int[n + 1], cnt = new int[n + 1];
        double[] dp = new double[n + 1];
        dp[0] = 0;
        for (int i = 1; i <= n; i++)
        {
            dp[i] = double.NegativeInfinity;
        }
        pre[0] = 1; // crucial to make 0 - pre[0] < 0 when tracing back
        cnt[0] = 0;

        if (verbose) { Trace.WriteLine("DPing..."); }
        int percentage = -1;
        for (int i = 0; i < n; i++)
        {
            int start = i;
            // Each length l writes only to slot start + l, which is distinct per l, and reads only
            // slot start, so the parallel iterations do not touch the same array element.
            Parallel.For(1, Math.Min(n - start, maxWordLength) + 1, l =>
            {
                // Alternative (disabled) objective, averaging per word:
                //var v = (dp[i] * cnt[i] + ratio) / (cnt[i] + 1);
                double v = dp[start] + Score(start, l);
                if (v > dp[start + l])
                {
                    dp[start + l] = v;
                    cnt[start + l] = cnt[start] + 1;
                    pre[start + l] = l;
                }
            });

            if (verbose && i > 0)
            {
                long percentDone = (i + 1) * 100L / n;
                if (percentDone > percentage)
                {
                    // Report the real percentage rather than a counter that lags on short inputs.
                    percentage = (int)percentDone;
                    Trace.WriteLine($"DPing... {percentage}% Ex: {s.Substring(i - pre[i], pre[i])} scored {dp[i]} avg {(double)i / cnt[i]} words");
                }
            }
        }

        if (verbose) { Trace.WriteLine("Tracing back..."); }
        List<int> idxs = new List<int>();
        for (int i = n; i >= 0; i -= pre[i])
        {
            // pre[i] stays 0 when no candidate word ending at i ever improved dp[i] (all scores were
            // negative infinity / NaN, or no candidate was evaluated). Stepping back by 0 would loop
            // forever, so fail loudly instead.
            if (pre[i] <= 0)
            {
                throw new InvalidOperationException(
                    $"No segmentation reaches position {i}: every candidate word ending there scored negative infinity or NaN, or maxWordLength ({maxWordLength}) is less than 1.");
            }
            idxs.Add(i);
        }

        if (verbose) { Trace.WriteLine("Picking words..."); }
        percentage = -1;
        // idxs holds the word boundaries from n down to 0; walk it backwards to emit words left to right.
        for (int i = idxs.Count - 1; i > 0; i--)
        {
            string word = s.Substring(idxs[i], idxs[i - 1] - idxs[i]);
            WordIdentified?.Invoke(word);
            ans.Add(word);

            if (verbose)
            {
                long percentDone = (idxs.Count - i + 1) * 100L / idxs.Count;
                if (percentDone > percentage)
                {
                    percentage = (int)percentDone;
                    Trace.WriteLine($"Picking words... {percentage}% Ex: {word}");
                }
            }
        }

        if (verbose) { Trace.Write(" => OK"); }
        SplittedWords = ans;
        IsBuilt = true;
        return ans;
    }
    finally
    {
        if (verbose) { Trace.Unindent(); }
    }
}
/// <summary>
/// Runs <c>Split</c> on a thread-pool thread via <see cref="Task.Run(Func{TResult})"/> and returns its result.
/// </summary>
/// <param name="s">Text to segment.</param>
/// <param name="maxWordLength">Longest candidate word length, in characters.</param>
/// <param name="probType">
/// Scoring formula. For Entropy, Entropy_L and Hank a null statistics list is passed to <c>Split</c>;
/// for every other value the list comes from <c>FrequencyPerLength()</c>.
/// </param>
/// <param name="verbose">Forwarded unchanged to <c>Split</c>.</param>
/// <returns>A task producing the list of words returned by <c>Split</c>.</returns>
public async Task<List<string>> SplitAsync(string s, int maxWordLength, ProbTypeEnum probType, bool verbose)
{
    return await Task.Run(() =>
    {
        // Decide inside the worker whether the per-length statistics are needed at all.
        bool skipStatistics;
        switch (probType)
        {
            case ProbTypeEnum.Entropy:
            case ProbTypeEnum.Entropy_L:
            case ProbTypeEnum.Hank:
                skipStatistics = true;
                break;
            default:
                skipStatistics = false;
                break;
        }

        var statistics = skipStatistics ? null : FrequencyPerLength();
        return Split(s, maxWordLength, statistics, probType, verbose);
    });
}