/// <summary>
/// Scores every entry of <paramref name="vectors"/> against <paramref name="baseVector"/>
/// and collects the scored results into a new PivotTable.
/// Each result's Data receives the score under "cos_sim", followed by every value
/// returned by CalculateDiffs whose key is not already present in that Data.
/// The progress callbacks are told the total up front and the zero-based index of
/// each entry once it has been added.
/// </summary>
/// <param name="baseVector">Entry that every candidate is compared against.</param>
/// <param name="vectors">Candidate entries; one result is produced per entry.</param>
/// <param name="logarithm">Forwarded unchanged to GetSparseSimilarity.</param>
/// <param name="onlyBase">Forwarded unchanged to GetSparseSimilarity.</param>
/// <returns>A new PivotTable holding one scored result per candidate.</returns>
public PivotTable GetSparseSimilarites(PivotTableEntry baseVector, PivotTable vectors, bool logarithm, bool onlyBase)
{
    this.pbarUpdate(vectors.Count, 0, 0);
    PivotTable results = new PivotTable();

    int processed = 0;
    foreach (PivotTableEntry candidate in vectors)
    {
        PivotTableAnalysisResult scored = GetSparseSimilarity(baseVector, candidate, logarithm, onlyBase);
        scored.Data.Add("cos_sim", scored.prob);

        // Merge the per-column diffs without overwriting anything already stored
        // (in particular the "cos_sim" value added just above).
        foreach (KeyValuePair<String, double> diff in CalculateDiffs(baseVector, candidate))
        {
            if (scored.Data.ContainsKey(diff.Key))
            {
                continue;
            }

            scored.Data.Add(diff.Key, diff.Value);
        }

        results.Add(scored);

        this.pbarValueUpdate(processed);
        processed++;
    }

    return results;
}
/// <summary>
/// Computes the cosine similarity between the Data vectors of two entries and returns it
/// as the prob score of a new PivotTableAnalysisResult. The result carries the RowKey and
/// Context of <paramref name="b"/>; this method adds nothing to the result's Data.
/// The "OBJECTID" key never takes part in the comparison, and a key that is present in
/// only one of the entries contributes 0 for the other.
/// Look here for more info: https://upload.wikimedia.org/math/4/e/4/4e45dc7ae582130813e804f793f24ead.png
/// </summary>
/// <param name="a">The base vector. Must not be null.</param>
/// <param name="b">The vector compared against the base. Must not be null.</param>
/// <param name="logarithm">When true, every value v is replaced by Log10(v + 1) before it is used.</param>
/// <param name="onlyBase">When true, only the keys of <paramref name="a"/> are compared; otherwise the union of both key sets is used.</param>
/// <returns>
/// A result whose prob is the cosine similarity, or -1 when the dot product or the
/// product of the two magnitudes is zero.
/// </returns>
/// <exception cref="ArgumentNullException">Either entry is null.</exception>
public PivotTableAnalysisResult GetSparseSimilarity(PivotTableEntry a, PivotTableEntry b, bool logarithm, bool onlyBase)
{
    if (a == null)
    {
        throw new ArgumentNullException("a", "neither a nor b are allowed to be null");
    }

    if (b == null)
    {
        throw new ArgumentNullException("b", "neither a nor b are allowed to be null");
    }

    // Collect the keys to compare. They are filtered while being added (rather than
    // removed afterwards) so the enumeration order, and therefore the floating-point
    // summation order below, stays exactly what it has always been.
    HashSet<string> keys = new HashSet<string>();
    foreach (string k in a.Data.Keys)
    {
        if (k != "OBJECTID")
        {
            keys.Add(k);
        }
    }

    if (!onlyBase)
    {
        foreach (string k in b.Data.Keys)
        {
            if (k != "OBJECTID")
            {
                keys.Add(k);
            }
        }
    }

    double aSoS = 0d;    // sum of squares of a
    double bSoS = 0d;    // sum of squares of b
    double dotProd = 0d;

    foreach (string key in keys)
    {
        double x;
        double y;

        // A key missing from one side counts as 0 for that side.
        if (!a.Data.TryGetValue(key, out x))
        {
            x = 0d;
        }

        if (!b.Data.TryGetValue(key, out y))
        {
            y = 0d;
        }

        if (logarithm)
        {
            x = Math.Log10(x + 1);
            y = Math.Log10(y + 1);
        }

        aSoS += x * x;
        bSoS += y * y;
        dotProd += x * y;
    }

    double div = Math.Sqrt(aSoS) * Math.Sqrt(bSoS);

    // -1 is the sentinel for "no usable similarity". Note that it is also returned for
    // vectors that merely share no non-zero component (dot product 0), for which the
    // mathematical cosine similarity would be 0.
    double similarity = (dotProd == 0 || div == 0d) ? -1d : dotProd / div;

    return new PivotTableAnalysisResult()
    {
        prob = similarity,
        RowKey = b.RowKey,
        Context = b.Context
    };
}
/// <summary>
/// Compares two pivot tables. Do not pass in columns that don't make sense to compare.
/// Rows of the two tables are paired by RowKey (a geohash prefix, per the inline comments).
/// For every key present in both tables a cosine similarity is calculated with
/// GetSparseSimilarity (logarithm on, keys taken from both rows) and stored in Data as
/// "cos_sim", together with "percent_change" = |cos_sim - 1| * 100. When
/// <paramref name="diffs"/> is true the per-column values from CalculateDiffs are included
/// in Data as well. A key present in only one table yields a result with prob 0 whose Data
/// maps every key of the combined key set to 0.
/// </summary>
/// <param name="ptA">First table. Its row keys must be unique within the table.</param>
/// <param name="ptB">Second table. Its row keys must be unique within the table.</param>
/// <param name="label">Value stored in the Label of every result.</param>
/// <param name="diffs">When true, Data of matched rows starts from CalculateDiffs; otherwise from an empty dictionary.</param>
/// <returns>A new PivotTable with one result per distinct row key found in either table.</returns>
/// <exception cref="ArgumentException">A row key occurs more than once within one of the tables.</exception>
public PivotTable DetectChange(PivotTable ptA, PivotTable ptB, string label, bool diffs)
{
    PivotTable outList = new PivotTable();

    // Each dictionary below is a geohash agg layer: key = aGeoHashPrefix, value = anAggVectorOfThatBox.
    Dictionary<string, PivotTableEntry> a = new Dictionary<string, PivotTableEntry>();
    Dictionary<string, PivotTableEntry> b = new Dictionary<string, PivotTableEntry>();

    // Union of the key sets of both tables.
    HashSet<string> hashset = new HashSet<string>();
    foreach (PivotTableEntry av in ptA)
    {
        a.Add(av.RowKey, av);
        hashset.Add(av.RowKey);
    }

    foreach (PivotTableEntry bv in ptB)
    {
        b.Add(bv.RowKey, bv);
        hashset.Add(bv.RowKey);
    }

    this.pbarUpdate.Invoke(hashset.Count, 0, 0);

    // Template payload for keys that exist in only one table: every key of the union mapped to 0.
    Dictionary<string, double> empty = new Dictionary<string, double>();
    foreach (string s in hashset)
    {
        empty.Add(s, 0d);
    }

    int x = 0;
    foreach (string geohash in hashset)
    {
        this.pbarValueUpdate.Invoke(x);
        x++;

        // TryGetValue leaves the out variable null when the key is absent.
        PivotTableEntry ava;
        PivotTableEntry avb;
        a.TryGetValue(geohash, out ava);
        b.TryGetValue(geohash, out avb);

        if (ava == null || avb == null)
        {
            outList.Add(new PivotTableAnalysisResult()
            {
                RowKey = geohash,
                prob = 0d,
                // Every result gets its own copy; sharing one dictionary instance would let
                // a change to one result's Data silently alter all the others.
                Data = new Dictionary<string, double>(empty),
                Label = label
            });
            continue;
        }

        PivotTableAnalysisResult p = GetSparseSimilarity(ava, avb, true, false);
        p.RowKey = geohash;
        p.Label = label;

        if (diffs)
        {
            p.Data = CalculateDiffs(ava, avb);
        }
        else
        {
            p.Data = new Dictionary<string, double>();
        }

        // Indexer assignment instead of Add: a compared column that happens to be called
        // "cos_sim" or "percent_change" must not abort the whole run with an ArgumentException.
        p.Data["cos_sim"] = p.prob;

        // GetSparseSimilarity reports -1 when it cannot produce a similarity; that value
        // maps to 200 here.
        p.Data["percent_change"] = Math.Abs(p.prob - 1) * 100;

        outList.Add(p);
    }

    return outList;
}
/// <summary>
/// Computes the cosine similarity between the Data vectors of two entries and returns it
/// as the prob score of a new PivotTableAnalysisResult. The result carries the RowKey and
/// Context of <paramref name="b"/>; this method adds nothing to the result's Data.
/// The "OBJECTID" key never takes part in the comparison, and a key that is present in
/// only one of the entries contributes 0 for the other.
/// Look here for more info: https://upload.wikimedia.org/math/4/e/4/4e45dc7ae582130813e804f793f24ead.png
/// </summary>
/// <param name="a">The base vector. Must not be null.</param>
/// <param name="b">The vector compared against the base. Must not be null.</param>
/// <param name="logarithm">When true, every value v is replaced by Log10(v + 1) before it is used.</param>
/// <param name="onlyBase">When true, only the keys of <paramref name="a"/> are compared; otherwise the union of both key sets is used.</param>
/// <returns>
/// A result whose prob is the cosine similarity, or -1 when the dot product or the
/// product of the two magnitudes is zero.
/// </returns>
/// <exception cref="ArgumentNullException">Either entry is null.</exception>
public PivotTableAnalysisResult GetSparseSimilarity(PivotTableEntry a, PivotTableEntry b, bool logarithm, bool onlyBase)
{
    if (a == null)
    {
        throw new ArgumentNullException("a", "neither a nor b are allowed to be null");
    }

    if (b == null)
    {
        throw new ArgumentNullException("b", "neither a nor b are allowed to be null");
    }

    // Collect the keys to compare. They are filtered while being added (rather than
    // removed afterwards) so the enumeration order, and therefore the floating-point
    // summation order below, stays exactly what it has always been.
    HashSet<string> keys = new HashSet<string>();
    foreach (string k in a.Data.Keys)
    {
        if (k != "OBJECTID")
        {
            keys.Add(k);
        }
    }

    if (!onlyBase)
    {
        foreach (string k in b.Data.Keys)
        {
            if (k != "OBJECTID")
            {
                keys.Add(k);
            }
        }
    }

    double aSoS = 0d;    // sum of squares of a
    double bSoS = 0d;    // sum of squares of b
    double dotProd = 0d;

    foreach (string key in keys)
    {
        double x;
        double y;

        // A key missing from one side counts as 0 for that side.
        if (!a.Data.TryGetValue(key, out x))
        {
            x = 0d;
        }

        if (!b.Data.TryGetValue(key, out y))
        {
            y = 0d;
        }

        if (logarithm)
        {
            x = Math.Log10(x + 1);
            y = Math.Log10(y + 1);
        }

        aSoS += x * x;
        bSoS += y * y;
        dotProd += x * y;
    }

    double div = Math.Sqrt(aSoS) * Math.Sqrt(bSoS);

    // -1 is the sentinel for "no usable similarity". Note that it is also returned for
    // vectors that merely share no non-zero component (dot product 0), for which the
    // mathematical cosine similarity would be 0.
    double similarity = (dotProd == 0 || div == 0d) ? -1d : dotProd / div;

    return new PivotTableAnalysisResult()
    {
        prob = similarity,
        RowKey = b.RowKey,
        Context = b.Context
    };
}