/// <summary>
/// Builds an identifier from a gzip-compressed binary distribution file.
/// </summary>
/// <remarks>
/// Layout as read here: tokenizer name (string), two Int32 values stored in
/// <c>lo</c> and <c>hi</c>, a Boolean stored in <c>tlc</c>, the language count
/// (Int32), then for each language its name (string), its gram count (Int64)
/// and that many (gram string, Int64 occurrence count) pairs.
/// NOTE(review): <paramref name="cap"/> is not stored on the instance in this
/// constructor, yet FillInSims reads a member named <c>cap</c> - confirm that
/// member is initialized elsewhere.
/// </remarks>
/// <param name="distroFile">Path of the distribution file to open for reading.</param>
/// <param name="cap">
/// Maximum number of leading grams kept per language; -1 keeps every gram.
/// </param>
public LanguageIdentifier(string distroFile, int cap)
{
    using (var br = new BinaryReader(
        new GZipStream(
            new FileStream(distroFile, FileMode.Open, FileAccess.Read),
            CompressionMode.Decompress)))
    {
        // Header: tokenizer selection and the settings later passed to it.
        this.tokenizer = Tokenization.Tokenizer(br.ReadString());
        this.lo = br.ReadInt32();
        this.hi = br.ReadInt32();
        this.tlc = br.ReadBoolean();

        var numLangs = br.ReadInt32();
        this.langs = new string[numLangs];
        this.distros = new Dictionary<string, double>[numLangs];
        this.numOccs = new long[numLangs];

        for (int i = 0; i < numLangs; i++)
        {
            this.langs[i] = br.ReadString();
            var numGrams = br.ReadInt64();

            // Keep at most `cap` grams (all of them when cap is -1).
            var maxRank = (cap == -1 || numGrams < cap) ? numGrams : cap;
            var arr = new GramFreq[maxRank];

            // The counter is long because numGrams is read as Int64; an int
            // counter would wrap for counts above int.MaxValue.
            for (long j = 0; j < numGrams; j++)
            {
                // Every record is read so the stream stays aligned for the
                // next language, but only the first maxRank are retained.
                var gram = br.ReadString();
                var occs = br.ReadInt64();
                if (j < maxRank)
                {
                    this.numOccs[i] += occs;
                    arr[j] = new GramFreq { gram = gram, freq = (double)occs };
                }
            }

            this.distros[i] = PopulateDistro(arr);
        }
    }
}
/// <summary>
/// Scores the text <paramref name="s"/> against each language table and
/// stores the score in the <c>sim</c> member of the matching entry of
/// <paramref name="res"/>.
/// </summary>
/// <param name="res">
/// Result slots; entry j receives the score computed against <c>distros[j]</c>.
/// Assumes <c>res</c> has no more entries than <c>distros</c> - TODO confirm with callers.
/// </param>
/// <param name="s">Text to tokenize and score.</param>
protected override void FillInSims(LangSim[] res, string s)
{
    // Count how often each token occurs in the input.
    var docFreqs = new Dictionary<string, long>();
    foreach (var tok in this.tokenizer(s, this.tlc, this.lo, this.hi))
    {
        // TryGetValue leaves `seen` at 0 for a token not yet in the map, so a
        // single lookup plus one store replaces ContainsKey + get + set.
        long seen;
        docFreqs.TryGetValue(tok, out seen);
        docFreqs[tok] = seen + 1;
    }

    // Copy the (token, count) pairs into an array without re-looking up each key.
    var arr = new GramFreq[docFreqs.Count];
    int next = 0;
    foreach (var entry in docFreqs)
    {
        arr[next++] = new GramFreq { gram = entry.Key, freq = entry.Value };
    }

    // Ordering comes from GramFreq's own default comparison (declared elsewhere).
    Array.Sort<GramFreq>(arr);

    // Limit the number of ranked entries to `cap`; -1 means no limit.
    var docMaxRank = (cap == -1 || arr.Length < cap) ? arr.Length : cap;

    for (int j = 0; j < res.Length; j++)
    {
        res[j].sim = this.rankSim(arr, docMaxRank, this.distros[j], this.maxPos);
    }
}