/// <summary>
/// Returns the size of the run of suffix-array entries around the suffix starting at
/// <paramref name="startIndex"/> whose first <paramref name="length"/> characters
/// compare equal (ordinal) to that suffix.
/// </summary>
/// <param name="sa">Suffix array providing the text <c>S</c> and the <c>SA</c> / <c>RANK</c> tables.</param>
/// <param name="startIndex">Index into <c>sa.RANK</c>; the value found there is used as the starting slot in <c>sa.SA</c>.</param>
/// <param name="length">Maximum number of characters compared for each pair of suffixes.</param>
/// <returns><c>hi - lo + 1</c>, where <c>lo</c> and <c>hi</c> are the outermost matching slots found.</returns>
private static int CountInsideSA(SuffixArray sa, int startIndex, int length)
{
    // Alternative left commented out by the original author:
    //var s = sa.S.Substring(startIndex, length);
    //return sa.UpperBound(s) - sa.LowerBound(s);

    int total = sa.S.Length;
    int lo = sa.RANK[startIndex];
    int hi = lo;

    // NOTE(review): the search below assumes matching entries are contiguous in sa.SA
    // (i.e. SA is sorted) - confirm against SuffixArray.

    // Gallop towards the front with doubling steps while the prefix still matches...
    int step = 1;
    while (lo - step >= 0
           && string.Compare(sa.S, sa.SA[lo], sa.S, sa.SA[lo - step], length, StringComparison.Ordinal) == 0)
    {
        lo -= step;
        step <<= 1;
    }

    // ...then halve the step back down to 1 to settle on the first matching slot.
    while (step > 0)
    {
        if (lo - step >= 0
            && string.Compare(sa.S, sa.SA[lo], sa.S, sa.SA[lo - step], length, StringComparison.Ordinal) == 0)
        {
            lo -= step;
        }
        step >>= 1;
    }

    // Same two phases towards the back to find the last matching slot.
    step = 1;
    while (hi + step < total
           && string.Compare(sa.S, sa.SA[hi], sa.S, sa.SA[hi + step], length, StringComparison.Ordinal) == 0)
    {
        hi += step;
        step <<= 1;
    }

    while (step > 0)
    {
        if (hi + step < total
            && string.Compare(sa.S, sa.SA[hi], sa.S, sa.SA[hi + step], length, StringComparison.Ordinal) == 0)
        {
            hi += step;
        }
        step >>= 1;
    }

    return hi - lo + 1;
}
/// <summary>
/// Creates a splitter whose callbacks are answered by the given suffix array.
/// </summary>
/// <param name="sa">Suffix array captured by the callbacks assigned below.</param>
public SentenceSplitter(SuffixArray sa)
{
    // Per-length statistics are delegated to MethodsForSuffixArray.
    FrequencyPerLength =
        () => MethodsForSuffixArray.FrequencyPerLength(sa);

    // Occurrence count of a string, delegated to MethodsForSuffixArray.
    Count =
        text => MethodsForSuffixArray.Count(sa, text);

    // Length of the text held by the suffix array.
    BaseDataLength = () =>
    {
        return sa.S.Length;
    };

    // Not available for this data source: invoking it always throws.
    NextChars = delegate
    {
        throw new NotImplementedException();
    };
}
/// <summary>
/// Builds the "SA" tab page: places the controls into the layout panels, wires the
/// event handlers, creates the <c>SuffixArray</c> and its <c>SentenceSplitter</c>,
/// and finally calls <c>StartServices()</c>.
/// </summary>
public SATabPage() : base("SA")
{
    //InitializeComponent();

    // Row 0 of the main panel: input box on the left, scrollable control column on the right.
    TLPmain.Controls.Add(TLPtop, 0, 0);
    {
        TLPtop.Controls.Add(TXBin, 0, 0);
        TLPtop.Controls.Add(new MyPanel() { Controls = { TLPctrl }, AutoScroll = true, Dock = DockStyle.Top }, 1, 0);
        {
            //TLPctrl.SetRowSpan(TXBin, TLPctrl.RowCount);

            // Controls are stacked in column 0; 'row' advances with every Add call.
            int row = 0;
            TLPctrl.Controls.Add(CBmethod, 0, row++);
            {
                CBmethod.Items.Add("Count Word");
                CBmethod.Items.Add("List Words");
                CBmethod.Items.Add("Send Socket");
                CBmethod.Items.Add("Cut by Code");
            }
            TLPctrl.Controls.Add(BTNexportSA, 0, row++);
            TLPctrl.Controls.Add(BTNsave, 0, row++);
            TLPctrl.Controls.Add(BTNload, 0, row++);
            TLPctrl.Controls.Add(BTNexportList, 0, row++);
            TLPctrl.Controls.Add(BTNnew, 0, row++);
            TLPctrl.Controls.Add(CHBdebugMode, 0, row++);
            TLPctrl.Controls.Add(CHBreplaceWithEmptyExceptChinese, 0, row++);
            TLPctrl.Controls.Add(CHBremoveEmpty, 0, row++);
            TLPctrl.Controls.Add(CHBverbose, 0, row++);
            TLPctrl.Controls.Add(IFdata, 0, row++);
            {
                IFdata.AddField("maxWordLength", maxWordLength.ToString());
            }
            TLPctrl.Controls.Add(CBprobType, 0, row++);
            {
                // One combo-box item per line of SentenceSplitter.probTypeString.
                foreach (var s in SentenceSplitter.probTypeString.Split('\n'))
                {
                    CBprobType.Items.Add(s);
                }

                CBprobType.SelectedValueChanged += (sender, e) =>
                {
                    // Pick the first enum value whose "probType == ProbTypeEnum.<value>" marker
                    // occurs in the selected text; FirstOrDefault yields default(ProbTypeEnum)
                    // when nothing matches. The marker is code-like text, so the search is
                    // ordinal rather than dependent on the current culture.
                    probType = Enum.GetValues(typeof(SentenceSplitter.ProbTypeEnum))
                        .Cast<SentenceSplitter.ProbTypeEnum>()
                        .FirstOrDefault(v => CBprobType.Text.IndexOf($"probType == ProbTypeEnum.{v}", StringComparison.Ordinal) != -1);
                    //MessageBox.Show(probType.ToString());
                };
            }
            TLPctrl.Controls.Add(CHBsplit, 0, row++);
        }
    }

    // Rows 1 and 2 of the main panel: output box and data box.
    TLPmain.Controls.Add(TXBout, 0, 1);
    TLPmain.Controls.Add(TXBdata, 0, 2);

    // Event wiring.
    //TXBdata.TextChanged += TXBdata_TextChanged;
    TXBdata.MouseDoubleClick += TXBdata_MouseDoubleClick;
    TXBin.TextChanged += TXBin_TextChanged;
    TXBin.KeyDown += TXBin_KeyDown;
    // Context-menu entry that replaces the input text with sampleCode.
    TXBin.ContextMenu = new ContextMenu(new[] { new MenuItem("sample code", delegate { TXBin.Text = sampleCode; }) });
    BTNexportSA.Click += BTNexportSA_Click;
    BTNexportList.Click += BTNexportList_Click;
    CHBsplit.CheckedChanged += CHBsplit_CheckedChanged;
    BTNsave.Click += BTNsave_Click;
    BTNload.Click += BTNload_Click;
    BTNnew.Click += BTNnew_Click;
    this.Controls.Add(TLPmain);

    //sam = new SAM();
    //sam.StatusChanged += (s) => { this.Invoke(new Action(() => this.Text = $"[*] {s}")); };
    //sm = new SimpleMethod();
    //sm.StatusChanged += (s) => { this.Invoke(new Action(() => this.Text = $"[*] {s}")); };
    //sa.StatusChanged += (s) => { this.Invoke(new Action(() => this.Text = $"[*] {s}")); };

    // Data structures behind the tab; created before StartServices() is called.
    sa = new SuffixArray();
    ss = new SentenceSplitter(sa);
    StartServices();
}
/// <summary>
/// Computes one <c>FPLtype</c> statistics record per length ("gram") from 1 to
/// <c>sa.S.Length</c> and caches the list in <c>sa.FPL</c>.
/// </summary>
/// <remarks>
/// The method starts from <c>n</c> gaps of size 1 between the positions <c>0..n</c>.
/// For gram = n down to 1 it removes every boundary <c>i</c> with <c>sa.HEIGHT[i] &gt;= gram</c>,
/// merging the two neighbouring gaps, and records statistics over the resulting gap sizes.
/// NOTE(review): presumably HEIGHT[i] is the common-prefix length of adjacent suffix-array
/// entries, which would make each gap a group of suffixes sharing a prefix of at least
/// gram characters - confirm against SuffixArray.
/// Index 0 of the result is left at <c>default(FPLtype)</c>.
/// </remarks>
/// <param name="sa">Suffix array supplying <c>S</c>, <c>HEIGHT</c> and the <c>FPL</c> cache.</param>
/// <returns>The cached <c>sa.FPL</c> when it is non-null, otherwise the newly built list of <c>n + 1</c> entries.</returns>
/// <exception cref="Exception">Any failure is shown in a message box and then rethrown unchanged.</exception>
public static List<FPLtype> FrequencyPerLength(SuffixArray sa)
{
    Trace.WriteLine("FrequencyPerLength(SuffixArray sa)...");
    try
    {
        Trace.Indent();
        if (sa.FPL != null)
        {
            return sa.FPL;
        }

        int n = sa.S.Length;

        Trace.WriteLine("Copying height data...");
        // (height, boundary index) for the n - 1 interior boundaries 1..n-1.
        List<Tuple<int, int>> h = new List<Tuple<int, int>>();
        for (int i = 1; i < n; i++)
        {
            h.Add(new Tuple<int, int>(sa.HEIGHT[i], i));
        }

        Trace.WriteLine("Sorting...");
        //DistributedSort(h, (a, b) => a.Item1.CompareTo(b.Item1));
        h.Sort((a, b) => a.Item1.CompareTo(b.Item1));

        // Merge log: (leftSize, rightSize) for every merge, plus one sentinel (-1, j)
        // closing the batch of merges that belongs to each gram.
        List<Tuple<int, int>> changes = new List<Tuple<int, int>>();
        {
            Trace.WriteLine("Creating linked list...");
            // Doubly linked list over the positions 0..n; neighbours start out adjacent.
            int[] linkl = new int[n + 1], linkr = new int[n + 1];
            for (int i = 0; i < n; i++)
            {
                linkl[i + 1] = i;
                linkr[i] = i + 1;
            }

            Trace.WriteLine("Simulate changes...");
            // h is sorted ascending, so walking j downwards visits the largest heights first.
            int j = n - 2;
            for (int gram = n; gram >= 1; gram--)
            {
                while (j >= 0 && h[j].Item1 >= gram)
                {
                    int k = h[j].Item2;
                    int l = linkl[k], r = linkr[k];
                    changes.Add(new Tuple<int, int>(k - l, r - k));
                    // Unlink boundary k so that l and r become direct neighbours.
                    linkl[r] = l;
                    linkr[l] = r;
                    --j; // j + 1 is the number of boundaries (splittings) still present
                }
                changes.Add(new Tuple<int, int>(-1, j));
            }
        }

        Trace.WriteLine("Building ans...");
        List<FPLtype> ans = new List<FPLtype>();
        ans.Resize(n + 1, default(FPLtype));
        //for (int i = 1; i <= n; i++) ans[i] = new FPLtype();

        Trace.WriteLine("Filling ans...");
        // With n == 0 there is no gram to fill and cnt would only have index 0, so the
        // cnt[1] initialisation below would throw; skip the whole phase in that case.
        if (n > 0)
        {
            long sp2 = n;                  // sum of gapSize^2
            double sp0_5 = n;              // sum of gapSize^0.5
            double slog = n * Math.Log(2); // sum of log(gapSize + 1)
            int[] cnt = new int[n + 1];    // cnt[size] = number of gaps with that size
            for (int i = 0; i <= n; i++)
            {
                Trace.Assert(cnt[i] == 0);
            }
            // current_count tracks the number of gaps whose size is <= cursor.
            int cursor = 1, current_count = cnt[1] = n;
            //System.Windows.Forms.MessageBox.Show("pass");
            for (int gram = n, c = 0; gram >= 1; gram--)
            {
                {
                    // Apply this gram's batch of merges to the running sums.
                    int cc = c;
                    while (true)
                    {
                        var p = changes[cc++];
                        if (p.Item1 == -1)
                        {
                            break;
                        }
                        sp2 -= Sq(p.Item1);
                        sp2 -= Sq(p.Item2);
                        sp2 += Sq(p.Item1 + p.Item2);
                        sp0_5 -= Math.Sqrt(p.Item1);
                        sp0_5 -= Math.Sqrt(p.Item2);
                        sp0_5 += Math.Sqrt(p.Item1 + p.Item2);
                        slog -= Math.Log(p.Item1 + 1);
                        slog -= Math.Log(p.Item2 + 1);
                        slog += Math.Log(p.Item1 + p.Item2 + 1);
                        --cnt[p.Item1];
                        if (p.Item1 <= cursor)
                        {
                            --current_count;
                        }
                        --cnt[p.Item2];
                        if (p.Item2 <= cursor)
                        {
                            --current_count;
                        }
                        ++cnt[p.Item1 + p.Item2];
                        if (p.Item1 + p.Item2 <= cursor)
                        {
                            ++current_count;
                        }
                        //Trace.WriteLine($"{p.Item1}\t{p.Item2}");
                        //Trace.Assert(cnt[p.Item1] >= 0 && cnt[p.Item2] >= 0);
                    }
                }

                // Advance c past the batch's sentinel; the sentinel carries j for this gram.
                while (changes[c++].Item1 != -1)
                {
                    ;
                }
                int j = changes[c - 1].Item2;

                try
                {
                    // Move cursor to the smallest size at which the gaps of size <= cursor
                    // reach 1% (0.01) of the j + 2 gaps.
                    // NOTE(review): the result is stored in a field named percent10 although
                    // the factor used here is 0.01 - confirm which one is intended.
                    while (current_count < 0.01 * (j + 2))
                    {
                        current_count += cnt[++cursor];
                    }
                    while (current_count - cnt[cursor] >= 0.01 * (j + 2))
                    {
                        current_count -= cnt[cursor--];
                    }
                }
                catch (Exception error)
                {
                    // Diagnostic only: the failure is displayed and processing continues.
                    System.Windows.Forms.MessageBox.Show($"cursor={cursor}, current_count={current_count}, i={gram}, j={j}, n={n}\r\n" + error.ToString());
                }

                // j + 1 boundaries remain, hence j + 2 gaps; u is the mean gap size.
                double u = (double)n / (j + 2);
                ans[gram] = new FPLtype
                {
                    uniqCnt = j + 2,
                    mean = u,
                    // E[x^2] - mean^2 can come out slightly negative through rounding;
                    // clamp it so the square root never produces NaN.
                    stderr = Math.Sqrt(Math.Max(0.0, (double)sp2 / (j + 2) - u * u)),
                    sqrtSum = sp0_5,
                    logSum = slog,
                    percent10 = cursor
                };
            }
        }

        Trace.Write("OK");
        //System.Windows.Forms.MessageBox.Show(string.Join(", ", ans.GetRange(0, 20)));
        return sa.FPL = ans;
    }
    catch (Exception error)
    {
        System.Windows.Forms.MessageBox.Show(error.ToString());
        throw;
    }
    finally
    {
        Trace.Unindent();
    }
}
/// <summary>
/// Returns the difference between the suffix array's upper and lower bound for
/// <paramref name="s"/>.
/// </summary>
/// <param name="sa">Suffix array that is queried.</param>
/// <param name="s">String passed to both bound queries.</param>
/// <returns><c>sa.UpperBound(s) - sa.LowerBound(s)</c>.</returns>
public static int Count(SuffixArray sa, string s)
{
    // The upper bound is queried first, matching the original evaluation order.
    int upper = sa.UpperBound(s);
    int lower = sa.LowerBound(s);
    return upper - lower;
}