public void TestCount()
{
    MultiSet<string> empty = new MultiSet<string>();
    Assert.AreEqual<int>(0, empty.Count);
    empty.Add("foo");
    empty.Add("bar");
    Assert.AreEqual<int>(2, empty.Count);
    empty.Add("foo");
    empty.Add("foo");
    Assert.AreEqual<int>(4, empty.Count);
}
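// The test above implies Count reports total multiplicity, not distinct
// elements. A minimal hypothetical sketch of that contract (the actual
// MultiSet<T> used in these examples is not shown here):
using System.Collections.Generic;

public class MiniMultiSet<T>
{
    private readonly Dictionary<T, int> _counts = new Dictionary<T, int>();

    // Total number of elements, counting duplicates.
    public int Count { get; private set; }

    public void Add(T item)
    {
        int n;
        _counts.TryGetValue(item, out n);
        _counts[item] = n + 1;
        Count++;
    }
}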
public Potion(Aspects.Primary primary1, Aspects.Primary primary2, Aspects.Primary primary3,
              MultiSet<Aspects.Secondary> secondaries, PotionSlot slot, Aspects.Secondary type, Effect effect)
{
    primaries = new MultiSet<Aspects.Primary>();
    primaries.Add(primary1);
    primaries.Add(primary2);
    primaries.Add(primary3);
    this.secondaries = new MultiSet<Aspects.Secondary>(secondaries);
    _slot = slot;
    _type = type;
    _effect = effect;
}
public void add(Aspects.Primary primary1, Aspects.Primary primary2, Aspects.Primary primary3,
                MultiSet<Aspects.Secondary> secondaries, PotionSlot slot, Aspects.Secondary type, Effect effect)
{
    Potion potion = new Potion(primary1, primary2, primary3, secondaries, slot, type, effect);
    MultiSet<Aspects.Primary> primaries = new MultiSet<Aspects.Primary>();
    primaries.Add(primary1);
    primaries.Add(primary2);
    primaries.Add(primary3);
    if (!potions.ContainsKey(primaries))
    {
        potions.Add(primaries, new List<Potion>());
    }
    potions[primaries].Add(potion);
}
public static string GetCharRange(string text)
{
    MultiSet<string> counter = new MultiSet<string>();
    foreach (int ch in text)
    {
        if (ch >= mCharRanges.Count)
        {
            counter.Add("Other");
            continue;
        }
        counter.Add(mCharRanges[ch]);
    }
    int max = 0;
    string charRange = "Other";
    foreach (KeyValuePair<string, int> item in counter)
    {
        if (item.Value > max)
        {
            max = item.Value;
            charRange = item.Key;
        }
    }
    return charRange;
}
public Potion createPotion(IngredientType ingredient1, IngredientType ingredient2, IngredientType ingredient3)
{
    IngredientData data1 = Ingredients.instance().getIngredient(ingredient1);
    IngredientData data2 = Ingredients.instance().getIngredient(ingredient2);
    IngredientData data3 = Ingredients.instance().getIngredient(ingredient3);
    MultiSet<Aspects.Secondary> secondaries = new MultiSet<Aspects.Secondary>();
    secondaries.Add(data1.secondary);
    secondaries.Add(data2.secondary);
    secondaries.Add(data3.secondary);
    Potion createdPotion = getBestMatch(data1.primary, data2.primary, data3.primary, secondaries);
    logPotionCreation(createdPotion, ingredient1, ingredient2, ingredient3);
    if (!hasDoneAutoPause && createdPotion != defaultPotion)
    {
        hasDoneAutoPause = true;
        GameObject.FindObjectOfType<PauseMenuController>().pause();
    }
    return createdPotion;
}
public static void GetVocabularyRichness(Text text, out double ttr, out double hl, out double honore, out double brunet, bool lemmas)
{
    // type-token ratio (TTR)
    MultiSet<string> tokens = new MultiSet<string>();
    int n = 0;
    foreach (Sentence sentence in text.mSentences)
    {
        foreach (Token token in sentence.mTokens)
        {
            if (!token.mIsPunctuation)
            {
                if (lemmas) { tokens.Add(token.mLemma.ToLower()); }
                else { tokens.Add(token.mTokenStr.ToLower()); }
                n++;
            }
        }
    }
    int v = tokens.CountUnique;
    ttr = (double)v / (double)n;
    // hapax legomena
    int v1 = tokens.ToList().Count(x => x.Key == 1);
    hl = (double)v1 / (double)n;
    // Honore's statistic: R = 100 x log(N) / (1 - V1 / V)
    honore = 100.0 * Math.Log(n) / (1.0 - (double)v1 / (double)v);
    // Brunet's index: W = N^(V^-0.165)
    brunet = Math.Pow(n, Math.Pow(v, -0.165));
}
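// A worked check of the four measures above on a toy token stream (my own
// example, not from the source corpus): "the cat sat on the mat" has
// N = 6 tokens, V = 5 types, and V1 = 4 hapax legomena (cat, sat, on, mat).
double n = 6, v = 5, v1 = 4;
double ttr = v / n;                                    // 0.833
double hl = v1 / n;                                    // 0.667
double honore = 100.0 * Math.Log(n) / (1.0 - v1 / v);  // 100 * ln(6) / 0.2 ~= 895.9
double brunet = Math.Pow(n, Math.Pow(v, -0.165));      // 6^(5^-0.165) ~= 3.95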
public void Remove()
{
    IMultiSet<string> multiSet = new MultiSet<string>();
    multiSet.Add("itemToDecrease", 3);
    multiSet.Add("itemToDelete", 3);
    multiSet.Add("itemToDelete_negative", 3);
    multiSet.Add("itemNotChanged", 3);
    Assert.AreEqual(3, multiSet.Remove("itemToDecrease", 1));
    Assert.IsTrue(multiSet.Remove("itemToDecrease"));
    Assert.AreEqual(3, multiSet.Remove("itemToDelete", 3));
    Assert.IsFalse(multiSet.Remove("itemToDelete"));
    Assert.AreEqual(3, multiSet.Remove("itemToDelete_negative", 4));
    Assert.AreEqual(3, multiSet.Remove("itemNotChanged", 0));
    Assert.AreEqual(0, multiSet.Remove("itemNotExist", 1));
    Assert.IsFalse(multiSet.Remove("itemNotExist"));
    Assert.AreEqual(1, multiSet.Count(i => i == "itemToDecrease"));
    Assert.AreEqual(3, multiSet.Count(i => i == "itemNotChanged"));
    Assert.IsFalse(multiSet.Any(i => i == "itemToDelete"));
    Assert.IsFalse(multiSet.Any(i => i == "itemToDelete_negative"));
    Assert.IsFalse(multiSet.Any(i => i == "itemNotExist"));
}
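// Reading the assertions above: Remove(item, n) appears to return the
// multiplicity present before removal (0 when absent), clamp n to what is
// available, and leave the set untouched when n == 0, while Remove(item)
// removes one occurrence and reports success. A hypothetical sketch that is
// consistent with every assertion, built on the count table from the earlier
// MiniMultiSet sketch (upkeep of the total Count property omitted for brevity):
public int Remove(T item, int count)
{
    int present;
    if (!_counts.TryGetValue(item, out present))
    {
        return 0;                      // "itemNotExist" cases
    }
    if (count > 0)
    {
        int remaining = present - count;
        if (remaining > 0) { _counts[item] = remaining; }
        else { _counts.Remove(item); } // over-removal clears the key entirely
    }
    return present;                    // multiplicity before removal
}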
public void SetItemCount_ExpectedCountMisMatch_DoesNotUpdate()
{
    IMultiSet<string> multiSet = new MultiSet<string>();
    multiSet.Add("item", 3);
    Assert.IsFalse(multiSet.SetItemCount("item", 4, 5));
    Assert.IsFalse(multiSet.SetItemCount("item", 0, 5));
    Assert.IsFalse(multiSet.SetItemCount("itemNotExist", 1, 5));
    Assert.IsFalse(multiSet.SetItemCount("itemNotExist", 5, 5));
    Assert.AreEqual(3, multiSet.Count(i => i == "item"));
    Assert.IsFalse(multiSet.Any(i => i == "itemNotExist"));
}
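// The test name reads as a compare-and-swap: SetItemCount(item, expectedCount,
// newCount) updates only when the current multiplicity equals expectedCount.
// A hypothetical sketch of that contract (signature and semantics inferred
// from the assertions, not taken from the library):
public bool SetItemCount(T item, int expectedCount, int newCount)
{
    int present;
    _counts.TryGetValue(item, out present); // absent items count as 0
    if (present != expectedCount)
    {
        return false;                        // mismatch: leave the set unchanged
    }
    _counts[item] = newCount;
    return true;
}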
public void Train(ILabeledExampleCollection<LblT, ExT> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    MultiSet<LblT> counter = new MultiSet<LblT>(mLblCmp);
    foreach (LabeledExample<LblT, ExT> lblEx in dataset)
    {
        counter.Add(lblEx.Label);
    }
    mPrediction = new Prediction<LblT>();
    foreach (KeyValuePair<LblT, int> keyVal in counter)
    {
        mPrediction.Inner.Add(new KeyDat<double, LblT>((double)keyVal.Value / (double)dataset.Count, keyVal.Key));
    }
    mPrediction.Inner.Sort(DescSort<KeyDat<double, LblT>>.Instance);
}
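// The snippet above is a majority-class baseline: it turns raw label counts
// into a probability distribution sorted descending. A small hedged check,
// reusing the counting pattern from the snippet (the labels are made up):
var counter = new MultiSet<string>();
foreach (string label in new[] { "A", "A", "B" })
{
    counter.Add(label);
}
// counter now holds A -> 2, B -> 1; dividing by the 3 examples yields
// P(A) = 0.667, P(B) = 0.333, so the top-ranked prediction is always A.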
public void Train(ILabeledExampleCollection<LblT, SparseVector<double>> dataset)
{
    Utils.ThrowException(dataset == null ? new ArgumentNullException("dataset") : null);
    Utils.ThrowException(dataset.Count == 0 ? new ArgumentValueException("dataset") : null);
    Dispose();
    int[] trainSet = new int[dataset.Count];
    int[] labels = new int[dataset.Count];
    Dictionary<LblT, int> lblToIdx = new Dictionary<LblT, int>(mLblCmp);
    MultiSet<int> lblCount = new MultiSet<int>();
    int j = 0;
    foreach (LabeledExample<LblT, SparseVector<double>> lblEx in dataset)
    {
        SparseVector<double> vec = lblEx.Example;
        int[] idx = new int[vec.Count];
        float[] val = new float[vec.Count];
        for (int i = 0; i < vec.Count; i++)
        {
            idx[i] = vec.InnerIdx[i] + 1;
            val[i] = (float)vec.InnerDat[i]; // *** cast to float
        }
        int lbl;
        if (!lblToIdx.TryGetValue(lblEx.Label, out lbl))
        {
            lblToIdx.Add(lblEx.Label, lbl = lblToIdx.Count);
            mIdxToLbl.Add(lblEx.Label);
        }
        Utils.ThrowException(lbl == 2 ? new ArgumentValueException("dataset") : null);
        trainSet[j++] = SvmLightLib.NewFeatureVector(idx.Length, idx, val, lbl == 0 ? 1 : -1);
        lblCount.Add(lbl == 0 ? 1 : -1);
    }
    string costFactor = "";
    if (mBiasedCostFunction)
    {
        costFactor = "-j " + ((double)lblCount.GetCount(-1) / (double)lblCount.GetCount(1));
    }
    mModelId = SvmLightLib.TrainModel(string.Format(CultureInfo.InvariantCulture,
        "-v {0} -c {1} -t {2} -g {3} -d {4} -s {5} -r {6} -b {7} -e {8} -# {9} {10} {11}",
        (int)mVerbosityLevel, mC, (int)mKernelType, mKernelParamGamma, mKernelParamD, mKernelParamS,
        mKernelParamC, mBiasedHyperplane ? 1 : 0, mEps, mMaxIter, mCustomParams, costFactor),
        trainSet.Length, trainSet);
    // delete training vectors
    foreach (int vecIdx in trainSet)
    {
        SvmLightLib.DeleteFeatureVector(vecIdx);
    }
}
public void AddSBTest()
{
    StringBuilder sb = new StringBuilder("aaa");
    StringBuilder sb1 = new StringBuilder("bbb");
    StringBuilder sb2 = new StringBuilder("ccc");
    List<StringBuilder> list = new List<StringBuilder>() { sb, sb1, sb2 };
    MultiSet<StringBuilder> ms = new MultiSet<StringBuilder>();
    foreach (var s in list)
    {
        ms.Add(s);
    }
    string output = "aaa, bbb, ccc";
    Assert.AreEqual(output, ms.ToString());
    Assert.AreEqual(3, ms.Count);
}
private Potion getBestMatch(Aspects.Primary primary1, Aspects.Primary primary2, Aspects.Primary primary3, MultiSet<Aspects.Secondary> secondaries)
{
    MultiSet<Aspects.Primary> primaries = new MultiSet<Aspects.Primary>();
    primaries.Add(primary1);
    primaries.Add(primary2);
    primaries.Add(primary3);
    if (!potions.ContainsKey(primaries))
    {
        return defaultPotion;
    }
    List<Potion> primaryMatches = potions[primaries];
    Potion bestMatch = null;
    foreach (Potion primaryMatch in primaryMatches)
    {
        bool match = (primaryMatch.getSecondaries().Except(secondaries).Count == 0);
        if (match && (bestMatch == null || bestMatch.getSecondaries().Count() < primaryMatch.getSecondaries().Count()))
        {
            bestMatch = primaryMatch;
        }
    }
    return bestMatch;
}
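// The containment test above relies on multiset difference: A.Except(B) is
// empty exactly when every element of A, counted with multiplicity, also
// appears in B. A hedged illustration with made-up aspects (string stands in
// for Aspects.Secondary), consistent with the TestExcept snippet further down:
var required = new MultiSet<string>();
required.Add("PLANT");
required.Add("PLANT");
var offered = new MultiSet<string>();
offered.Add("PLANT");
offered.Add("DAIRY");
// required.Except(offered) still contains one PLANT, so required is not
// covered by offered; getBestMatch would skip such a candidate.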
private void PrecomputeProbabilities(ILabeledExampleCollection<LblT, BinaryVector> dataset)
{
    mFeaturePriors = new Dictionary<int, double>();
    ArrayList<LblT> tmp = new ArrayList<LblT>();
    Dictionary<LblT, int> lblToIdx = new Dictionary<LblT, int>(mLblCmp);
    foreach (LabeledExample<LblT, BinaryVector> labeledExample in dataset)
    {
        if (!lblToIdx.ContainsKey(labeledExample.Label))
        {
            lblToIdx.Add(labeledExample.Label, lblToIdx.Count);
            tmp.Add(labeledExample.Label);
        }
    }
    // prepare counters
    mExampleCount = new int[tmp.Count];
    mFeatureProb = new Dictionary<int, double>[tmp.Count];
    for (int j = 0; j < mFeatureProb.Length; j++)
    {
        mFeatureProb[j] = new Dictionary<int, double>();
    }
    MultiSet<int> featureCounter = new MultiSet<int>();
    // count features
    int i = 0;
    foreach (LabeledExample<LblT, BinaryVector> labeledExample in dataset)
    {
        mLogger.ProgressFast(Logger.Level.Info, /*sender=*/ this, "PrecomputeProbabilities", "Processing example {0} / {1}", ++i, dataset.Count);
        int lblIdx = lblToIdx[labeledExample.Label];
        mExampleCount[lblIdx]++;
        double val;
        foreach (int idx in labeledExample.Example)
        {
            featureCounter.Add(idx);
            if (mFeatureProb[lblIdx].TryGetValue(idx, out val))
            {
                mFeatureProb[lblIdx][idx] = val + 1;
            }
            else
            {
                mFeatureProb[lblIdx].Add(idx, 1);
            }
        }
    }
    // estimate probabilities
    i = 0;
    foreach (Dictionary<int, double> probVec in mFeatureProb)
    {
        foreach (int featIdx in new ArrayList<int>(probVec.Keys))
        {
            double p0 = ((double)featureCounter.GetCount(featIdx) + 1.0) / ((double)dataset.Count + 2.0); // rule of succession (feature prior)
            double p = (probVec[featIdx] + 2.0 * p0) / ((double)mExampleCount[i] + 2.0); // m-estimate (m = 2)
            probVec[featIdx] = p;
            if (!mFeaturePriors.ContainsKey(featIdx))
            {
                mFeaturePriors.Add(featIdx, p0);
            }
        }
        i++;
    }
    mIdxToLbl = tmp.ToArray();
}
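// To make the two smoothing steps above concrete, a worked check with
// made-up counts: a feature present in 4 of 10 examples overall gets the
// rule-of-succession prior p0 = (4 + 1) / (10 + 2) ~= 0.4167; within a class
// of 3 examples where it appeared twice, the m-estimate (m = 2) gives
// p = (2 + 2 * p0) / (3 + 2) ~= 0.5667.
double p0 = (4 + 1.0) / (10 + 2.0);
double p = (2 + 2.0 * p0) / (3 + 2.0);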
public void AddSegment(Vector2 _left, Vector2 _right)
{
    m_segements.Add(_left, _right);
}
public void TestEquals()
{
    MultiSet<String> requirements = new MultiSet<string>();
    requirements.Add("DAIRY");
    requirements.Add("PLANT");
    requirements.Add("PLANT");
    MultiSet<String> available = new MultiSet<string>();
    available.Add("PLANT");
    available.Add("PLANT");
    available.Add("DAIRY");
    Assert.AreEqual(requirements, available);
    Assert.AreEqual<int>(requirements.GetHashCode(), available.GetHashCode());
}
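// The equality test requires hash codes that ignore insertion order. One
// hypothetical way to get that (the snippet's real implementation is not
// shown) is to combine per-element hashes with commutative arithmetic over
// a count table such as the Dictionary-backed MiniMultiSet sketch near the top:
public override int GetHashCode()
{
    int hash = 0;
    foreach (KeyValuePair<T, int> pair in _counts)
    {
        // Addition is commutative, so enumeration order cannot change the
        // result; unchecked lets the arithmetic wrap instead of throwing.
        unchecked { hash += pair.Key.GetHashCode() * 31 + pair.Value; }
    }
    return hash;
}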
static void Main(string[] args)
{
    Console.WriteLine(TimePoint.FromTicks(16087186800000000).ToString());
    return;

#if true
    var h = new HashSet<TimePoint>();
    var d = new Dictionary<TimePoint, int>();
    h.Add(new TimePoint(2));
    h.Add(new TimePoint(2));
    d.Add(new TimePoint(302342342341), 2);
    d.Add(new TimePoint(302342342340), 3);
    foreach (var i in d)
    {
        Console.WriteLine(i.Key.ToString() + i.Value);
    }
#elif true
    // Json
    var jsonObjectCollection = (JsonDataObject)JsonParser.Parse("{\"a\" : FalSe}");
    Console.WriteLine(jsonObjectCollection["a"].GetBool());
    //foreach (var i in jsonObjectCollection)
    //{
    //    var a = (JsonDataArray)i;
    //    foreach (var jj in a)
    //    {
    //        var n = (JsonDataNumber)jj;
    //        var Number = n.GetUInt32();
    //        Console.WriteLine(Number);
    //    }
    //}
    return;

    string JsonStr = @" [3], [4] ";
    var j = JsonParser.Parse(JsonStr);
    Console.WriteLine(j.ToString());

    // string JsonStr = @"{
    //     """" : """",
    //     ""boolName"" : true,
    //     ""boolName2"" : True,
    //     ""boolName3"" : false,
    //     ""boolName4"" : False,
    //     ""name"" : ""string: ,@' t value"",
    //     ""name2"":12.34 ,
    //     ""name3"" :5667,
    //     ""objname"" :{},
    //     ""array"": [123,2234,""ok"",false,true,{},[]]
    // }";
    // var j = JsonParser.Parse(JsonStr);

    // The tests below cannot run until nested containers are supported.
    //var pa = new SPoint[2];
    //for (int i = 0; i < pa.Length; ++i)
    //    pa[i] = new SPoint(1, 2);
    //j.Push("PointArray", pa);

    //var sa = new string[2];
    //for (int i = 0; i < sa.Length; ++i)
    //    sa[i] = i.ToString();
    //j.Push("StringArray", sa);

    //var sl = new List<string>();
    //for (int i = 0; i < 2; ++i)
    //    sl.Add(i.ToString());
    //j.Push("StringList", sl);

    //var hs = new HashSet<string>();
    //for (int i = 0; i < 2; ++i)
    //    hs.Add(i.ToString());
    //j.Push("StringHashSet", hs);

    //var mss = new MultiSet<string>();
    //for (int i = 0; i < 2; ++i)
    //    mss.Add(i.ToString());
    //j.Push("StringMultiSet", mss);

    //var pd = new Dictionary<string, SPoint>();
    //for (int i = 0; i < 2; ++i)
    //    pd.Add(i.ToString(), new SPoint(1, 1));
    //j.Push("PointDictionary", pd);

    //var pmm = new CMultiMap<string, SPoint>();
    //for (int i = 0; i < 2; ++i)
    //    pmm.Add(i.ToString(), new SPoint(i, i));
    //j.Push("PointMultiMap", pmm);

    //var l = new List<int>();
    //l.Add(1000);
    //l.Add(2000);
    //j.Push("ListList", l);

    //var dd = new Dictionary<int, int>();
    //dd.Add(1000, 1000);
    //dd.Add(2000, 2000);
    //var dhs = new HashSet<Dictionary<int, int>>();
    //for (int i = 0; i < 2; ++i)
    //    dhs.Add(dd);
    //j.Push("DictionaryHashSet", dhs);

    // Console.WriteLine(j.ToString());
#elif false
    // MultiMap
    int v;
    Console.WriteLine("MultiSet");
    var ms = new MultiSet<int>();
    ms.Add(3);
    ms.Add(3);
    ms.Add(3);
    ms.Add(2);
    ms.Add(4);
    Console.WriteLine("Count : " + ms.Count);
    ms.RemoveLast();
    ms.RemoveLast();
    ms.RemoveLast();
    ms.RemoveLast();
    Console.WriteLine("All Datas");
    foreach (var i in ms)
    {
        Console.WriteLine(i);
    }
    Console.WriteLine("MultiMap");
    var mm = new MultiMap<int, int>();
    mm.Add(3, 4);
    mm.Add(3, 5);
    mm.Add(3, 6);
    mm.Add(2, 2);
    Console.WriteLine("All Datas : " + mm.Count);
    foreach (var i in mm)
    {
        Console.WriteLine(i);
    }
    Console.WriteLine("ToArray");
    var a = mm.ToArray(3);
    foreach (var i in a)
    {
        Console.WriteLine(i);
    }
    var it = mm.First();
    Console.WriteLine("First");
    Console.WriteLine(it);
    mm.RemoveFirst();
    Console.WriteLine("First Removed Count : " + mm.Count);
    foreach (var i in mm)
    {
        Console.WriteLine(i);
    }
#elif true
    // Resize
    var l = new List<int>();
    l.Resize(4);
#endif
}
public void TestExcept()
{
    MultiSet<String> requirements = new MultiSet<string>();
    requirements.Add("DAIRY");
    requirements.Add("PLANT");
    requirements.Add("PLANT");
    MultiSet<String> available = new MultiSet<string>();
    available.Add("PLANT");
    available.Add("PLANT");
    available.Add("DAIRY");
    Assert.AreEqual(requirements.Except(available), new MultiSet<string>());
    Assert.AreEqual(available.Except(requirements), new MultiSet<String>());
}
private static void EncodeInternal(Stream input, Stream output, bool xor, long inputLength)
{
    var rleSource = new List<NibbleRun>();
    var counts = new SortedList<NibbleRun, long>();
    using (IEnumerator<byte> unpacked = Unpacked(input))
    {
        // Build RLE nibble runs, RLE-encoding the nibble runs as we go along.
        // Maximum run length is 8, meaning 7 repetitions.
        if (unpacked.MoveNext())
        {
            NibbleRun current = new NibbleRun(unpacked.Current, 0);
            while (unpacked.MoveNext())
            {
                NibbleRun next = new NibbleRun(unpacked.Current, 0);
                if (next.Nibble != current.Nibble || current.Count >= 7)
                {
                    rleSource.Add(current);
                    long count;
                    counts.TryGetValue(current, out count);
                    counts[current] = count + 1;
                    current = next;
                }
                else
                {
                    ++current.Count;
                }
            }
        }
    }

    // We will use the Package-merge algorithm to build the optimal length-limited
    // Huffman code for the current file. To do this, we must map the current
    // problem onto the Coin Collector's problem.
    // Build the basic coin collection.
    var qt = new List<EncodingCodeTreeNode>();
    foreach (var kvp in counts)
    {
        // No point in including anything with weight less than 2, as they
        // would actually increase compressed file size if used.
        if (kvp.Value > 1)
        {
            qt.Add(new EncodingCodeTreeNode(kvp.Key, kvp.Value));
        }
    }
    qt.Sort();

    // The base coin collection for the length-limited Huffman coding has
    // one coin list per character in length of the limitation. Each coin list
    // has a constant "face value", and each coin in a list has its own
    // "numismatic value". The "face value" is unimportant in the way the code
    // is structured below; the "numismatic value" of each coin is the number
    // of times the underlying nibble run appears in the source file.

    // This will hold the Huffman code map.
    // NOTE: while the codes that will be written in the header will not be
    // longer than 8 bits, it is possible that a supplementary code map will
    // add "fake" codes that are longer than 8 bits.
    var codeMap = new SortedList<NibbleRun, KeyValuePair<long, byte>>();

    // Size estimate. This is used to build the optimal compressed file.
    long sizeEstimate = long.MaxValue;

    // We will solve the Coin Collector's problem several times, each time
    // ignoring more of the least frequent nibble runs. This allows us to find
    // *the* lowest file size.
    while (qt.Count > 1)
    {
        // Make a copy of the basic coin collection.
        var q0 = new List<EncodingCodeTreeNode>(qt);

        // Ignore the lowest weighted item. Will only affect the next iteration
        // of the loop. If it can be proven that there is a single global
        // minimum (and no local minima for file size), then this could be
        // simplified to a binary search.
        qt.RemoveAt(qt.Count - 1);

        // We now solve the Coin collector's problem using the Package-merge
        // algorithm. The solution goes here.
        var solution = new List<EncodingCodeTreeNode>();

        // This holds the packages from the last iteration.
        var q = new List<EncodingCodeTreeNode>(q0);

        int target = (q0.Count - 1) << 8, idx = 0;
        while (target != 0)
        {
            // Gets lowest bit set in its proper place:
            int val = (target & -target), r = 1 << idx;
            // Is the current denomination equal to the least denomination?
            if (r == val)
            {
                // If yes, take the least valuable node and put it into the solution.
                solution.Add(q[q.Count - 1]);
                q.RemoveAt(q.Count - 1);
                target -= r;
            }

            // The coin collection has coins of values 1 to 8; copy from the
            // original in those cases for the next step.
            var q1 = new List<EncodingCodeTreeNode>();
            if (idx < 7)
            {
                q1.AddRange(q0);
            }

            // Split the current list into pairs and insert the packages into
            // the next list.
            while (q.Count > 1)
            {
                EncodingCodeTreeNode child1 = q[q.Count - 1];
                q.RemoveAt(q.Count - 1);
                EncodingCodeTreeNode child0 = q[q.Count - 1];
                q.RemoveAt(q.Count - 1);
                q1.Add(new EncodingCodeTreeNode(child0, child1));
            }

            idx++;
            q.Clear();
            q.AddRange(q1);
            q.Sort();
        }

        // The Coin Collector's problem has been solved. Now it is time to
        // map the solution back into the length-limited Huffman coding problem.
        // To do that, we iterate through the solution and count how many times
        // each nibble run has been used (remember that the coin collection had
        // had multiple coins associated with each nibble run) -- this number
        // is the optimal bit length for the nibble run.
        var baseSizeMap = new SortedList<NibbleRun, long>();
        foreach (var item in solution)
        {
            item.Traverse(baseSizeMap);
        }

        // With the length-limited Huffman coding problem solved, it is now time
        // to build the code table. As input, we have a map associating a nibble
        // run to its optimal encoded bit length. We will build the codes using
        // the canonical Huffman code.

        // To do that, we must invert the size map so we can sort it by code size.
        var sizeOnlyMap = new MultiSet<long>();
        // This map contains lots more information, and is used to associate
        // the nibble run with its optimal code. It is sorted by code size,
        // then by frequency of the nibble run, then by the nibble run.
        var sizeMap = new MultiSet<SizeMapItem>();
        foreach (var item in baseSizeMap)
        {
            long size = item.Value;
            sizeOnlyMap.Add(size);
            sizeMap.Add(new SizeMapItem(size, counts[item.Key], item.Key));
        }

        // We now build the canonical Huffman code table.
        // "baseCode" is the code for the first nibble run with a given bit length.
        // "carry" is how many nibble runs were demoted to a higher bit length
        // at an earlier step.
        // "cnt" is how many nibble runs have a given bit length.
        long baseCode = 0;
        long carry = 0, cnt;
        // This list contains the codes sorted by size.
        var codes = new List<KeyValuePair<long, byte>>();
        for (byte j = 1; j <= 8; j++)
        {
            // How many nibble runs have the desired bit length.
            cnt = sizeOnlyMap.Count(j) + carry;
            carry = 0;
            for (int k = 0; k < cnt; k++)
            {
                // Sequential binary numbers for codes.
                long code = baseCode + k;
                long mask = (1L << j) - 1;
                // We do not want any codes composed solely of 1's or which
                // start with 111111, as that sequence is reserved.
                if ((j <= 6 && code == mask) || (j > 6 && code == (mask & ~((1L << (j - 6)) - 1))))
                {
                    // We must demote this many nibble runs to a longer code.
                    carry = cnt - k;
                    cnt = k;
                    break;
                }
                codes.Add(new KeyValuePair<long, byte>(code, j));
            }
            // This is the beginning bit pattern for the next bit length.
            baseCode = (baseCode + cnt) << 1;
        }

        // With the canonical table built, the codemap can finally be built.
        var tempCodemap = new SortedList<NibbleRun, KeyValuePair<long, byte>>();
        using (IEnumerator<SizeMapItem> enumerator = sizeMap.GetEnumerator())
        {
            int pos = 0;
            while (enumerator.MoveNext() && pos < codes.Count)
            {
                tempCodemap[enumerator.Current.NibbleRun] = codes[pos];
                ++pos;
            }
        }

        // We now compute the final file size for this code table.
        // 2 bytes at the start of the file, plus 1 byte at the end of the
        // code table.
        long tempsize_est = 3 * 8;
        byte last = 0xff;
        // Start with any nibble runs with their own code.
        foreach (var item in tempCodemap)
        {
            // Each new nibble needs an extra byte.
            if (item.Key.Nibble != last)
            {
                tempsize_est += 8;
                last = item.Key.Nibble;
            }
            // 2 bytes per nibble run in the table.
            tempsize_est += 2 * 8;
            // How many bits this nibble run uses in the file.
            tempsize_est += counts[item.Key] * item.Value.Value;
        }

        // Supplementary code map for the nibble runs that can be broken up into
        // shorter nibble runs with a smaller bit length than inlining.
        var supCodemap = new Dictionary<NibbleRun, KeyValuePair<long, byte>>();
        // Now we will compute the size requirements for inline nibble runs.
        foreach (var item in counts)
        {
            if (!tempCodemap.ContainsKey(item.Key))
            {
                // Nibble run does not have its own code. We need to find out if
                // we can break it up into smaller nibble runs with total code
                // size less than 13 bits or if we need to inline it (13 bits).
                if (item.Key.Count == 0)
                {
                    // If this is a nibble run with zero repeats, we can't break
                    // it up into smaller runs, so we inline it.
                    tempsize_est += (6 + 7) * item.Value;
                }
                else if (item.Key.Count == 1)
                {
                    // We stand a chance of breaking the nibble run.
                    // This case is rather trivial, so we hard-code it.
                    // We can break this up only as 2 consecutive runs of a nibble
                    // run with count == 0.
                    KeyValuePair<long, byte> value;
                    if (!tempCodemap.TryGetValue(new NibbleRun(item.Key.Nibble, 0), out value) || value.Value > 6)
                    {
                        // The smaller nibble run either does not have its own code
                        // or it results in a longer bit code when doubled up than
                        // would result from inlining the run. In either case, we
                        // inline the nibble run.
                        tempsize_est += (6 + 7) * item.Value;
                    }
                    else
                    {
                        // The smaller nibble run has a small enough code that it is
                        // more efficient to use it twice than to inline our nibble
                        // run. So we do exactly that, by adding a (temporary) entry
                        // in the supplementary codemap, which will later be merged
                        // into the main codemap.
                        long code = value.Key;
                        byte len = value.Value;
                        code = (code << len) | code;
                        len <<= 1;
                        tempsize_est += len * item.Value;
                        supCodemap[item.Key] = new KeyValuePair<long, byte>(code, (byte)(0x80 | len));
                    }
                }
                else
                {
                    // We stand a chance of breaking the nibble run.
                    byte n = item.Key.Count;
                    // This is a linear optimization problem subjected to 2
                    // constraints. If the number of repeats of the current nibble
                    // run is N, then we have N dimensions.
                    // Reference to table of linear coefficients. This table has
                    // N columns for each line.
                    byte[,] myLinearCoeffs = linearCoeffs[n - 2];
                    int rows = myLinearCoeffs.GetLength(0);
                    byte nibble = item.Key.Nibble;
                    // List containing the code length of each nibble run, or 13
                    // if the nibble run is not in the codemap.
                    var runlen = new List<long>();
                    // Initialize the list.
                    for (byte i = 0; i < n; i++)
                    {
                        // Is this run in the codemap?
                        KeyValuePair<long, byte> value;
                        if (tempCodemap.TryGetValue(new NibbleRun(nibble, i), out value))
                        {
                            // It is.
                            // Put code length in the vector.
                            runlen.Add(value.Value);
                        }
                        else
                        {
                            // It is not.
                            // Put inline length in the vector.
                            runlen.Add(6 + 7);
                        }
                    }
                    // Now go through the linear coefficient table and tally up
                    // the total code size, looking for the best case.
                    // The best size is initialized to be the inlined case.
                    long bestSize = 6 + 7;
                    int bestLine = -1;
                    for (int i = 0; i < rows; i++)
                    {
                        // Tally up the code length for this coefficient line.
                        long len = 0;
                        for (byte j = 0; j < n; j++)
                        {
                            byte c = myLinearCoeffs[i, j];
                            if (c == 0)
                            {
                                continue;
                            }
                            len += c * runlen[j];
                        }
                        // Is the length better than the best yet?
                        if (len < bestSize)
                        {
                            // If yes, store it as the best.
                            bestSize = len;
                            bestLine = i;
                        }
                    }
                    // Have we found a better code than inlining?
                    if (bestLine >= 0)
                    {
                        // We have; use it. To do so, we have to build the code
                        // and add it to the supplementary code table.
                        long code = 0, len = 0;
                        for (byte i = 0; i < n; i++)
                        {
                            byte c = myLinearCoeffs[bestLine, i];
                            if (c == 0)
                            {
                                continue;
                            }
                            // Is this run in the codemap?
                            KeyValuePair<long, byte> value;
                            if (tempCodemap.TryGetValue(new NibbleRun(nibble, i), out value))
                            {
                                // It is; it MUST be, as the other case is impossible
                                // by construction.
                                for (int j = 0; j < c; j++)
                                {
                                    len += value.Value;
                                    code <<= value.Value;
                                    code |= value.Key;
                                }
                            }
                        }
                        if (len != bestSize)
                        {
                            // ERROR! DANGER! THIS IS IMPOSSIBLE!
                            // But just in case...
                            tempsize_est += (6 + 7) * item.Value;
                        }
                        else
                        {
                            // By construction, bestSize is at most 12.
                            byte c = (byte)bestSize;
                            // Add it to supplementary code map.
                            supCodemap[item.Key] = new KeyValuePair<long, byte>(code, (byte)(0x80 | c));
                            tempsize_est += bestSize * item.Value;
                        }
                    }
                    else
                    {
                        // No, we will have to inline it.
                        tempsize_est += (6 + 7) * item.Value;
                    }
                }
            }
        }

        // Merge the supplementary code map into the temporary code map.
        foreach (var item in supCodemap)
        {
            tempCodemap[item.Key] = item.Value;
        }

        // Round up to a full byte.
        tempsize_est = (tempsize_est + 7) & ~7;

        // Is this iteration better than the best?
        if (tempsize_est < sizeEstimate)
        {
            // If yes, save the codemap and file size.
            codeMap = tempCodemap;
            sizeEstimate = tempsize_est;
        }
    }

    // We now have a prefix-free code map associating the RLE-encoded nibble
    // runs with their code. Now we write the file.
    // Write header.
    BigEndian.Write2(output, (ushort)((Convert.ToInt32(xor) << 15) | ((int)inputLength >> 5)));
    byte lastNibble = 0xff;
    foreach (var item in codeMap)
    {
        byte length = item.Value.Value;
        // length with bit 7 set is a special device for further reducing file size, and
        // should NOT be on the table.
        if ((length & 0x80) != 0)
        {
            continue;
        }
        NibbleRun nibbleRun = item.Key;
        if (nibbleRun.Nibble != lastNibble)
        {
            // 0x80 marks byte as setting a new nibble.
            NeutralEndian.Write1(output, (byte)(0x80 | nibbleRun.Nibble));
            lastNibble = nibbleRun.Nibble;
        }
        long code = item.Value.Key;
        NeutralEndian.Write1(output, (byte)((nibbleRun.Count << 4) | length));
        NeutralEndian.Write1(output, (byte)code);
    }

    // Mark end of header.
    NeutralEndian.Write1(output, 0xff);

    // Write the encoded bitstream.
    UInt8_E_L_OutputBitStream bitStream = new UInt8_E_L_OutputBitStream(output);

    // The RLE-encoded source makes for a far faster encode as we simply
    // use the nibble runs as an index into the map, meaning a quick binary
    // search gives us the code to use (if in the map) or tells us that we
    // need to use inline RLE.
    foreach (var nibbleRun in rleSource)
    {
        KeyValuePair<long, byte> value;
        if (codeMap.TryGetValue(nibbleRun, out value))
        {
            long code = value.Key;
            byte len = value.Value;
            // len with bit 7 set is a device to bypass the code table at the
            // start of the file. We need to clear the bit here before writing
            // the code to the file.
            len &= 0x7f;
            // We can have codes in the 9-12 range due to the break up of large
            // inlined runs into smaller non-inlined runs. Deal with those high
            // bits first, if needed.
            if (len > 8)
            {
                bitStream.Write((byte)((code >> 8) & 0xff), len - 8);
                len = 8;
            }
            bitStream.Write((byte)(code & 0xff), len);
        }
        else
        {
            bitStream.Write(0x3f, 6);
            bitStream.Write(nibbleRun.Count, 3);
            bitStream.Write(nibbleRun.Nibble, 4);
        }
    }

    // Fill remainder of last byte with zeroes and write if needed.
    bitStream.Flush(false);
}
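// The encoder above assigns canonical Huffman codes as "sequential binary
// numbers" per bit length (baseCode + k). A standalone worked example with
// made-up code lengths; it omits the encoder's reserved all-ones handling:
int[] lengths = { 2, 2, 3, 3, 3 };   // ascending, as the sorted sizeMap guarantees
long code = 0;
int prevLen = lengths[0];
foreach (int len in lengths)
{
    code <<= (len - prevLen);        // shift left when moving to a longer bit length
    Console.WriteLine("len {0}: code {1}", len, Convert.ToString(code, 2).PadLeft(len, '0'));
    code++;
    prevLen = len;
}
// Prints 00, 01, 100, 101, 110: a prefix-free set, matching the
// baseCode = (baseCode + cnt) << 1 progression in the encoder.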
static void Main(string[] args)
{
    MultiSet<string> urlCount = new MultiSet<string>();
    MultiSet<string> domainCount = new MultiSet<string>();
    Dictionary<string, Set<string>> domainToUrlMapping = new Dictionary<string, Set<string>>();
    Dictionary<string, Dictionary<string, Set<string>>> data = new Dictionary<string, Dictionary<string, Set<string>>>();
    Dictionary<string, Dictionary<string, Set<string>>> domainData = new Dictionary<string, Dictionary<string, Set<string>>>();
    using (SqlConnection connection = new SqlConnection(Utils.GetConfigValue("DbConnectionString")))
    {
        connection.Open();
        using (SqlCommand cmd = new SqlCommand(@"SELECT name, responseUrl from Documents", connection))
        {
            cmd.CommandTimeout = 0;
            using (SqlDataReader reader = cmd.ExecuteReader())
            {
                //foreach (string fileName in Directory.GetFiles(Utils.GetConfigValue("DataFolder", ".").TrimEnd('\\'), "*.xml.gz", SearchOption.AllDirectories))
                while (reader.Read())
                {
                    //Console.WriteLine(fileName);
                    //Document doc = new Document("", "");
                    //doc.ReadXmlCompressed(fileName);
                    //Console.WriteLine(doc.Name);
                    Console.WriteLine(reader.GetValue<string>("name"));
                    //string url = doc.Features.GetFeatureValue("responseUrl");
                    string url = reader.GetValue<string>("responseUrl");
                    //Console.WriteLine(url);
                    string left;
                    ArrayList<string> path;
                    ArrayList<KeyDat<string, string>> qParsed;
                    ParseUrl(url, out left, out path, out qParsed);
                    string urlKey = UrlAsString(left, path, qParsed, new Set<string>());
                    urlCount.Add(urlKey);
                    domainCount.Add(left);
                    if (!domainToUrlMapping.ContainsKey(left))
                    {
                        domainToUrlMapping.Add(left, new Set<string>());
                    }
                    domainToUrlMapping[left].Add(urlKey);
                    if (!data.ContainsKey(urlKey))
                    {
                        data.Add(urlKey, new Dictionary<string, Set<string>>());
                    }
                    if (!domainData.ContainsKey(left))
                    {
                        domainData.Add(left, new Dictionary<string, Set<string>>());
                    }
                    Dictionary<string, Set<string>> urlInfo = data[urlKey];
                    Dictionary<string, Set<string>> domainInfo = domainData[left];
                    foreach (KeyDat<string, string> item in qParsed)
                    {
                        //Console.WriteLine(item.Key + "=" + item.Dat);
                        if (!urlInfo.ContainsKey(item.Key))
                        {
                            urlInfo.Add(item.Key, new Set<string>());
                        }
                        urlInfo[item.Key].Add(item.Dat);
                        if (!domainInfo.ContainsKey(item.Key))
                        {
                            domainInfo.Add(item.Key, new Set<string>());
                        }
                        domainInfo[item.Key].Add(item.Dat);
                    }
                }
            }
        }
    }
    Set<string> paramShitList = new Set<string>(Utils.GetConfigValue("ExcludeUrlArgs", "utm_campaign,feedName,mod,rss_id,comment,commentid,partner").Split(','));
    StreamWriter w = new StreamWriter(Utils.GetConfigValue("OutputFileName", "reportDomains.txt"));
    foreach (KeyValuePair<string, Dictionary<string, Set<string>>> item in domainData)
    {
        bool found = false;
        foreach (KeyValuePair<string, Set<string>> paramInfo in item.Value)
        {
            if (paramInfo.Value.Count > 1 && !paramShitList.Contains(paramInfo.Key.ToLower()))
            {
                found = true;
                break;
            }
        }
        if (found)
        {
            bool __found = false;
            StringBuilder s = new StringBuilder();
            s.AppendLine("********************** Domain Info **********************");
            s.AppendLine();
            s.AppendLine(item.Key + " (" + domainCount.GetCount(item.Key) + ")");
            foreach (KeyValuePair<string, Set<string>> paramInfo in item.Value)
            {
                if (!paramShitList.Contains(paramInfo.Key) && paramInfo.Value.Count > 1)
                {
                    s.AppendLine("\t" + paramInfo.Key + "\t" + paramInfo.Value.Count + "\t" + paramInfo.Value);
                }
            }
            s.AppendLine();
            s.AppendLine("*** Details ***");
            s.AppendLine();
            foreach (string url in domainToUrlMapping[item.Key])
            {
                bool _found = false;
                foreach (KeyValuePair<string, Set<string>> paramInfo in data[url])
                {
                    if (paramInfo.Value.Count > 1 && !paramShitList.Contains(paramInfo.Key))
                    {
                        _found = true;
                        break;
                    }
                }
                if (_found)
                {
                    __found = true;
                    s.AppendLine(url + " (" + urlCount.GetCount(url) + ")");
                    foreach (KeyValuePair<string, Set<string>> paramInfo in data[url])
                    {
                        if (paramInfo.Value.Count > 1)
                        {
                            s.AppendLine("\t" + paramInfo.Key + "\t" + paramInfo.Value.Count + "\t" + paramInfo.Value);
                        }
                    }
                    s.AppendLine();
                }
            }
            s.AppendLine();
            if (__found)
            {
                w.Write(s.ToString());
            }
        }
    }
    w.Close();
}
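// A hedged mini-example of the counting pattern in the report above:
// MultiSet.Add tallies each occurrence and GetCount reads one key's
// multiplicity back (the URLs are made up):
var domainCount = new MultiSet<string>();
domainCount.Add("http://example.com/");
domainCount.Add("http://example.com/");
domainCount.Add("http://example.org/");
// Writes "http://example.com/ (2)", the same "key (N)" format the report uses.
Console.WriteLine("http://example.com/ (" + domainCount.GetCount("http://example.com/") + ")");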