// computes and steps along the gradient of the Perceptron objective function
public static double[] PerceptronStep(PINQueryable<Example> input, double[] normal, double epsilon)
{
    // select the examples that are currently mis-labeled by the normal vector
    var errors = input.Where(x => x.Label * x.Vector.Select((v, i) => v * normal[i]).Sum() < 0.0);

    // fold the average error into the normal
    var newnormal = new double[normal.Length];
    foreach (var coordinate in Enumerable.Range(0, normal.Length))
    {
        newnormal[coordinate] = normal[coordinate] + errors.NoisyAverage(epsilon, x => x.Label * x.Vector[coordinate]);
    }

    return newnormal;
}
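// NOTE: the Example type used by PerceptronStep, SupportVectorStep, and LogisticStep is not shown
// in this section. The following is a minimal hypothetical sketch, consistent with how it is used
// here: a feature vector paired with a +/-1.0 label, both supplied to the constructor.
public class Example
{
    public double[] Vector { get; private set; }
    public double Label { get; private set; }

    public Example(double[] vector, double label)
    {
        Vector = vector;
        Label = label;
    }
}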
// computes the outer product of the data matrix with itself; if the data are centered, this is the covariance matrix
public static double[][] Covariance(PINQueryable<double[]> input, int dimensions, double epsilon)
{
    double[][] outer = new double[dimensions][];
    foreach (var i in Enumerable.Range(0, dimensions))
    {
        outer[i] = new double[dimensions];
        foreach (var j in Enumerable.Range(0, dimensions))
        {
            outer[i][j] = input.NoisyAverage(epsilon, x => x[i] * x[j]);
        }
    }

    return outer;
}
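// NOTE: OtherMain's (commented-out) centroid computation calls a Mean helper that is not shown in
// this section. A minimal hypothetical sketch, assuming one NoisyAverage per coordinate in the
// same style as Covariance above:
public static double[] Mean(PINQueryable<double[]> input, int dimensions, double epsilon)
{
    var means = new double[dimensions];
    foreach (var i in Enumerable.Range(0, dimensions))
    {
        means[i] = input.NoisyAverage(epsilon, x => x[i]);
    }

    return means;
}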
// runs one step of the iterative k-means algorithm
public static void kMeansStep(PINQueryable<double[]> input, double[][] centers, double epsilon)
{
    // partition the data set by the supplied centers; somewhat icky in pure LINQ... (and it assumes centers[0] exists)
    var parts = input.Partition(centers, x => NearestCenter(x, centers));

    // update each of the centers
    foreach (var center in centers)
    {
        var part = parts[center];
        foreach (var index in Enumerable.Range(0, center.Length))
        {
            center[index] = part.NoisyAverage(epsilon, x => x[index]);
        }
    }
}
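// NOTE: the NearestCenter helper used above (and again in OtherMain) is not shown in this section.
// A plausible hypothetical sketch, assuming squared Euclidean distance; it returns a reference into
// centers, so the Partition keys above match by reference:
public static double[] NearestCenter(double[] x, double[][] centers)
{
    return centers.OrderBy(center => center.Select((v, i) => (v - x[i]) * (v - x[i])).Sum())
                  .First();
}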
// computes and steps along the gradient of the SVM objective function: Sum_i HingeLoss(1.0 - normal^T x_i y_i) + ||w||_2^2
public static double[] SupportVectorStep(PINQueryable<Example> input, double[] normal, double epsilon)
{
    // select the examples that are currently mis-labeled by the normal vector; also add some negative normal for our regularizer
    var errors = input.Where(x => x.Label * x.Vector.Select((v, i) => v * normal[i]).Sum() < 1.0)
                      .Concat(Enumerable.Repeat(new Example(normal, -1.0), 10).AsQueryable());

    // fold the average error into the normal
    var newnormal = new double[normal.Length];
    foreach (var coordinate in Enumerable.Range(0, normal.Length))
    {
        newnormal[coordinate] = normal[coordinate] + errors.NoisyAverage(epsilon, x => x.Label * x.Vector[coordinate]);
    }

    return newnormal;
}
public static void test3(String[] args)
{
    // preparing a private data source
    var filename = @"..\..\test3_groupbyname.txt";
    var data = File.ReadAllLines(filename).AsQueryable();
    PINQAgentLogger agent = new PINQAgentLogger(filename);
    var text = new PINQueryable<string>(data, agent);

    var users = text.Select(line => line.Split(','))
                    .Where(x => x[1] == args[0])
                    .Where(x => x[3] == args[1]);

    Console.WriteLine(" Without noise -- patients with cancer whose address is Beijing: " + users.count() + " people");
    Console.WriteLine(" With noise -- patients with cancer whose address is Beijing: " + users.NoisyCount(10.0) + " people");
    Console.ReadKey();
}
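// NOTE: the lowercase count() used above (and in test5 and function1) is not part of stock PINQ,
// which deliberately exposes only NoisyCount. These tests appear to rely on a modified PINQueryable
// that also reports the exact count for comparison; a hypothetical sketch of such a member, assuming
// the wrapped data is held in a field named "source":
public int count()
{
    return this.source.Count(); // exact, noise-free count; for testing only, as it bypasses the privacy agent
}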
static void Main(string[] args)
{
    /* Note: various DryadLINQ data sources are commented out, and replaced
     * with empty data sets to avoid compile errors. */

    // open DryadLINQ data source.
    //var ddc = new DryadDataContext(@"file://\\sherwood-091\dryadlinqusers\mcsherry");
    //IQueryable<string> searchesdata = ddc.GetPartitionedTable<string>("SearchLogs.txt", CompressionScheme.GZipFast);
    IQueryable<string> searchesdata = Enumerable.Empty<string>().AsQueryable();

    // encase the data source in the PINQueryable privacy type.
    PINQueryable<string> searches = new PINQueryable<string>(searchesdata, new PINQAgentLogger("searches"));

    // extract fields, then restrict to searches for args[0]
    var searchsubset = searches.Select(x => x.Split(','))
                               .Where(x => x[20].ToLower() == args[0]);

    Console.WriteLine(args[0] + " count: " + searchsubset.NoisyCount(0.1));

    #region Further analysis, and visualization.

    // open a second data set, containing IP-to-lat/lon mappings.
    //IQueryable<string[]> iplatlondata = ddc.GetPartitionedTable<LineRecord>("IPtoLatLon.txt", CompressionScheme.GZipFast).Select(x => x.line.Split('\t'));
    IQueryable<string[]> iplatlondata = Enumerable.Empty<string>().Select(x => x.Split('\t')).AsQueryable();
    PINQueryable<string[]> iplatlon = new PINQueryable<string[]>(iplatlondata, new PINQAgentLogger("iplatlon"));

    // extract the IP address, and clip off the final octet.
    var searchips = searchsubset.Select(x => x[0].Split('.'))
                                .Where(x => x.Count() == 4)
                                .Select(x => x[0] + "." + x[1] + "." + x[2] + ".0");

    // join searches with IP locations to get normalized coordinates
    var coordinates = from x in searchips
                      join y in iplatlon on x equals y[0]
                      select new double[] { Convert.ToDouble(y.First()[1]) / 90.0,
                                            Convert.ToDouble(y.First()[2]) / 180.0 };

    // prepare and output an html page visualization via Virtual Earth
    WriteHeader(args[0]);             // output the header of the .html
    Histogram(coordinates, 100, ""); // analyze data, output contents
    WriteFooter();                    // output the footer of the .html

    #endregion
}
public static void debt(PINQueryable<Record> db, IQueryable<Record> source)
{
    // Get the average debt, omitting records with no debt
    var q1 = db.Where(x => x.debt != 0);
    var q2 = source.Where(x => x.debt != 0);

    // Divide by 10000 so the values fall within the [-1,+1] range
    Expression<Func<Record, double>> debt = x => x.debt / 10000;

    double avgDebt = q1.NoisyAverage(0.2, debt) * 10000;
    double cleanAvgDebt = q2.Average(debt) * 10000;

    Console.WriteLine("Noisy Schools with Debt: " + q1.NoisyCount(0.1) + "\t\t\t\tClean: " + q2.Count());
    Console.WriteLine("Noisy Average Debt: $" + (1000 * avgDebt) + "\t\t\t\t\tClean: $" + (1000 * cleanAvgDebt));

    double totalDebt = db.NoisySum(0.2, debt) * 10000;
    double cleanTotalDebt = source.Sum(debt) * 10000;

    Console.WriteLine("Noisy Total Debt: $" + (1000 * totalDebt) + "\t\t\t\t\t\tClean: $" + (1000 * cleanTotalDebt));
}
// computes and steps along the gradient of the logarithm of the Logistic Regression objective function
public static double[] LogisticStep(PINQueryable<Example> input, double[] normal, double epsilon)
{
    // compute the logistic probability of (x_i, y_i) under "normal", subtracted from (label + 1.0) / 2.0 = target
    var errors = input.Select(x => new
    {
        vector = x.Vector,
        error = (x.Label + 1.0) / 2.0 - 1.0 / (1 + Math.Exp(-x.Vector.Select((v, i) => v * normal[i]).Sum()))
    });

    // fold the average error into the normal
    var newnormal = new double[normal.Length];
    foreach (var coordinate in Enumerable.Range(0, normal.Length))
    {
        newnormal[coordinate] = normal[coordinate] + errors.NoisySum(epsilon, x => x.error * x.vector[coordinate]) * 0.00001;
    }

    return newnormal;
}
public static PINQueryable<int[]> BoundDegree(PINQueryable<int[]> edges, int bound)
{
    // reduce the degree of the graph
    var clamped = edges.GroupBy(edge => edge[0])                      // collect up edges by source
                       .SelectMany(bound, group => group.Take(bound)) // only keep *bound* of them
                       .GroupBy(edge => edge[1])                      // collect up edges by target
                       .SelectMany(bound, group => group.Take(bound)); // only keep *bound* of them

    // A more "privacy efficient" approach uses the generalized Distinct transformation.
    // The stability constant is then 4, rather than the 4 * bound^2 incurred by the GroupBy operations above.
    // clamped = edges.Distinct(bound, edge => edge[0])
    //                .Distinct(bound, edge => edge[1]);

    // symmetrize (if interested) and return. degree is now at most 2 * bound.
    return clamped.Select(x => new int[] { x[1], x[0] })
                  .Concat(clamped)
                  .Distinct();
}
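// NOTE: the graph analyses in Main below call an ExtendPaths helper that is not shown in this
// section. A plausible hypothetical sketch, using the same six-argument bounded Join that Main
// uses directly, where pathbound/edgebound limit the records per key to control stability:
public static PINQueryable<int[]> ExtendPaths(PINQueryable<int[]> paths, PINQueryable<int[]> edges, int pathbound, int edgebound)
{
    // match the last vertex of each path against edge sources, and append the edge's target vertex
    return paths.Join(edges, path => path.Last(), edge => edge[0], pathbound, edgebound,
                      (path, edge) => path.Concat(new int[] { edge[1] }).ToArray());
}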
static double[] GetErrorWholeCount(IQueryable<BSOM_DataSet_revised> data, PINQueryable<BSOM_DataSet_revised> search, double[] epsilons)
{
    // Get true value
    int trueCount = data.Count();

    // Get list to hold our answers
    double[] result = new double[epsilons.Length];

    // Calculate differences
    short idx = 0;
    foreach (double ep in epsilons)
    {
        result[idx++] = trueCount - search.NoisyCount(ep);
    }

    return result;
}
static void TestPartitionWhere(IQueryable<BSOM_DataSet_revised> data, PINQueryable<BSOM_DataSet_revised> search, double[] epsilons)
{
    // Group IDs by O1_PI_01 scores, then count items in each group.
    // This is what is emulated by the PINQ Partition operator, below.
    var result = data.GroupBy(x => x.O1_PI_01).Select(
        group => new { key = group.Key, count = group.Count() });

    Console.WriteLine("Count of items in distinct O1_PI_01 groups");
    foreach (var r in result)
    {
        Console.WriteLine(String.Format("Score {0}: {1}", r.key, r.count));
    }

    // PINQ version, with Partition instead of GroupBy.
    // Partition is poorly documented; see the example at
    // https://github.com/LLGemini/PINQ/blob/master/TestHarness/TestHarness.cs
    // Note we must explicitly give the keys here, so PINQ assumes
    // we already know something about the data in order to use
    // this powerful operator.
    // Note our keys and values must be of the same type, hence we use
    // string keys. These must also perfectly match the values
    // returned by the raw query.
    string[] keys = { "0.5000", "0.5500", "0.6000", "0.6500", "0.7000",
                      "0.7500", "0.8000", "0.8500", "0.9000", "0.9500", "1.0000" };
    var pinqQuerySet = search.Partition(keys, x => x.O1_PI_01);

    Console.WriteLine("Noisy Counts:");
    foreach (double ep in epsilons)
    {
        foreach (string key in keys)
        {
            Console.WriteLine(String.Format("Epsilon {0}\tScore {1}: {2}", ep, key, pinqQuerySet[key].NoisyCount(ep)));
        }
        Console.WriteLine("---");
    }
}
static void Main(string[] args)
{
    // Read from the data file, and insert into the Record class
    string[] lines = System.IO.File.ReadAllLines(@"C:\Users\Administrator\Documents\Visual Studio 2010\Projects\DPLab\ConsoleApplication2\DPLabData.csv");
    IList<Record> recList = new List<Record>();
    foreach (string line in lines)
    {
        // Use "," as the delimiter to break the line into an array of fields
        string[] words = line.Split(',');

        // Create a new Record
        Record rec = new Record();
        rec.setAll(words); // set all properties

        // Add the Record to the List
        recList.Add(rec);
    }

    // Convert recList to IQueryable
    var source = recList.AsQueryable<Record>();
    var agent = new PINQAgentBudget(5.0);
    var db = new PINQueryable<Record>(source, agent);

    distTotalRev(db, source);   // Chart 1 (4 figures) (e = 1)
    Console.WriteLine("=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=\n");
    distLocalRev(db, source);   // Chart 2 (4 figures) (e = 1)
    Console.WriteLine("=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=\n");
    distExpenses(db, source);   // Chart 3 (4 figures) (e = 1)
    Console.WriteLine("=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=\n");
    debt(db, source);           // Figure 1, 2, and 3 (e = 0.5)
    Console.WriteLine("=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=\n");
    costPerStudent(db, source); // Figure 4 (e = 0.2)
    highEnrollment(db, source); // Figure 5, 6 (e = 0.4)
    lowEnrollment(db, source);  // Figure 7, 8 (e = 0.4)
    Console.WriteLine("=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=+~+=\n");
    teachSalary(db, source);    // Figure 9, 10, 11 (e = 0.5)

    Console.ReadLine(); // Pause
}
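// NOTE: the Record class is not shown in this section. A minimal hypothetical sketch covering only
// the members referenced here (debt, and the setAll used above); the real class presumably carries
// one property per column of DPLabData.csv, and the column index below is a placeholder:
public class Record
{
    public double debt; // used by debt(), which scales it into [-1,+1] by dividing by 10000

    public void setAll(string[] words)
    {
        // parse each CSV field into its property; only debt is sketched, at a hypothetical index
        debt = Double.Parse(words[0]);
    }
}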
static double[] GetErrorRangedCount(IQueryable<BSOM_DataSet_revised> data, PINQueryable<BSOM_DataSet_revised> search, double[] epsilons)
{
    // Get true value
    double trueSum = data.Where(x => Convert.ToDouble(x.O1_PI_01) < 0.8).Count();

    // Get list to hold our answers
    double[] result = new double[epsilons.Length];

    // Calculate differences
    short idx = 0;
    foreach (double ep in epsilons)
    {
        result[idx++] = trueSum - search.Where(x => Convert.ToDouble(x.O1_PI_01) < 0.8).NoisyCount(ep);
    }

    return result;
}
static double[] GetErrorAverage(IQueryable<BSOM_DataSet_revised> data, PINQueryable<BSOM_DataSet_revised> search, double[] epsilons)
{
    // Get true value
    double trueAvg = data.Average(x => Convert.ToDouble(x.O1_PI_01));

    // Get list to hold our answers
    double[] result = new double[epsilons.Length];

    // Calculate differences
    short idx = 0;
    foreach (double ep in epsilons)
    {
        result[idx++] = trueAvg - search.NoisyAverage(ep, x => Convert.ToDouble(x.O1_PI_01));
    }

    return result;
}
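// NOTE: the harness that drives the three GetError* helpers above is not shown. A hypothetical
// driver might sweep a grid of epsilons through each helper and print the errors side by side;
// the grid and formatting below are illustrative, not from the source:
static void ReportErrors(IQueryable<BSOM_DataSet_revised> data, PINQueryable<BSOM_DataSet_revised> search)
{
    double[] epsilons = { 0.01, 0.1, 1.0, 10.0 }; // illustrative epsilon grid

    double[] whole = GetErrorWholeCount(data, search, epsilons);
    double[] ranged = GetErrorRangedCount(data, search, epsilons);
    double[] average = GetErrorAverage(data, search, epsilons);

    foreach (var i in Enumerable.Range(0, epsilons.Length))
    {
        Console.WriteLine("epsilon {0}:\tcount error {1:F2}\tranged error {2:F2}\taverage error {3:F2}",
                          epsilons[i], whole[i], ranged[i], average[i]);
    }
}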
public static void test5(String[] args)
{
    // preparing a private data source
    var filename = @"..\..\processed.cleveland.data";
    var data = File.ReadAllLines(filename, Encoding.UTF8).AsQueryable();
    PINQAgentLogger agent = new PINQAgentLogger(filename);
    var text = new PINQueryable<string>(data, agent);

    var parts = text.Select(line => line.Split(','))
                    .Partition(args, fields => fields[13]);

    Console.WriteLine("People without heart disease: {0}, with noise added: {1}", parts["0"].count(), parts["0"].NoisyCount(1.0));
    Console.WriteLine("People with disease level 1: {0}, with noise added: {1}", parts["1"].count(), parts["1"].NoisyCount(1.0));
    Console.WriteLine("People with disease level 2: {0}, with noise added: {1}", parts["2"].count(), parts["2"].NoisyCount(1.0));
    Console.WriteLine("People with disease level 3: {0}, with noise added: {1}", parts["3"].count(), parts["3"].NoisyCount(1.0));
    Console.WriteLine("People with disease level 4: {0}, with noise added: {1}", parts["4"].count(), parts["4"].NoisyCount(1.0));
    Console.WriteLine("Total: {0}", parts["0"].count() + parts["1"].count() + parts["2"].count()
                                  + parts["3"].count() + parts["4"].count());
    Console.WriteLine();
    Console.ReadKey();
}
static void TestExhaustedPrivacyBudget(IQueryable<BSOM_DataSet_revised> data)
{
    // Note we need no LINQ version of this query, as there is no
    // privacy budget to compare against.

    // We first need a PINQueryable object that actually
    // checks against a budget
    PINQueryable<BSOM_DataSet_revised> search = new PINQueryable<BSOM_DataSet_revised>(
        data, new PINQAgentBudget(50));

    // Essentially, apply transformations until we can't anymore.
    // This is done by repeatedly applying a 'where' transform,
    // while incrementing the threshold we intend to cut at.
    Console.Write("Number of iterations we can do before the privacy budget is exhausted: ");

    double threshold = 0.1;
    int iters = 0;
    while (true)
    {
        // Do a selection of data
        var result = search.Where(x => Convert.ToDouble(x.O1_PI_01) > threshold);

        // Try to do a noisy count, breaking if an exception is thrown
        try
        {
            result.NoisyCount(1);
        }
        catch (Exception)
        {
            Console.WriteLine(iters);
            break;
        }

        // Increment threshold and counter
        threshold += 0.1;
        iters++;
    }
}
public static void function1()
{
    // preparing a private data source
    var filename = @"..\..\test2.txt";
    var data = File.ReadAllLines(filename).AsQueryable();
    PINQAgentLogger agent = new PINQAgentLogger(filename);
    var text = new PINQueryable<string>(data, agent);

    /**** Data is now sealed up. Use from this point on is unrestricted ****/

    // output a noisy count of the number of lines of text
    Console.WriteLine("Lines of text: " + text.count() + " Lines of text (noisy): " + text.NoisyCount(1.0));
    //Console.WriteLine("**privacy change**\tbudget:{0}", agent.getBudget());

    // restrict using a user-defined predicate, and count again (with noise)
    Console.WriteLine("Lines with semi-colons: " + text.Where(line => line.Contains(';')).NoisyCount(1.0));
    //Console.WriteLine("**privacy change**\tbudget:{0}", agent.getBudget());

    // think about splitting the records into arrays (declarative, so nothing happens yet)
    var words = text.Select(line => line.Split('*'));
    Console.WriteLine("words: {0}, words_noisy: {1}", words.count(), words.NoisyCount(1.0));

    // partition the data by number of "words", and count how many of each type there are
    var keys = new int[] { 0, 1, 2, 3, 4, 5 };
    var parts = words.Partition(keys, line => line.Count());
    foreach (var count in keys)
    {
        Console.WriteLine();
        Console.WriteLine("Lines with " + count + " words (without noise):\t" + parts[count].count());
        Console.WriteLine("Lines with " + count + " words (noisy):\t" + parts[count].NoisyCount(1.0));
    }
    Console.ReadKey();
}
private static void wrapPrecedenceTable(string precedenceFile)
{
    IEnumerable<Event> events = cc.Read<Event>(precedenceFile, inputFileDescription);
    EventQuery = new PINQueryable<Event>(events.AsQueryable(), Agent);
}
static void OtherMain(string[] args)
{
    var dimensions = 8;
    var records = 10000;

    var sourcedata = GenerateData(dimensions).Take(records).ToArray().AsQueryable();
    var securedata = new PINQueryable<double[]>(sourcedata, null);

    // let's start by computing the centroid of the data
    // var means = Mean(securedata, dimensions, 0.1);
    //
    // Console.WriteLine("mean vector:");
    // foreach (var mean in means)
    //     Console.Write("\t{0:F4}", mean);
    // Console.WriteLine();
    // Console.WriteLine();
    //
    // // we can also center the data and compute its covariance
    // var centered = securedata.Select(x => x.Select((v, i) => v - means[i]).ToArray());
    // var covariance = Covariance(centered, dimensions, 8);
    //
    // Console.WriteLine("covariance matrix:");
    // foreach (var row in covariance)
    // {
    //     foreach (var entry in row)
    //         Console.Write("\t{0:F4}", entry);
    //     Console.WriteLine();
    // }
    // Console.WriteLine();

    // iterative algorithms are also possible. we'll do k-means first
    var k = 3;
    var centers = GenerateData(dimensions).Take(k).ToArray();

    var iterations = 5;
    foreach (var iteration in Enumerable.Range(0, iterations))
    {
        kMeansStep(securedata, centers, 0.1);
    }

    Console.WriteLine("kMeans: {0} centers, {1} iterations", k, iterations);
    foreach (var center in centers)
    {
        foreach (var value in center)
        {
            Console.Write("\t{0:F4}", value);
        }
        Console.WriteLine();
    }
    Console.WriteLine();

    // moving to supervised learning, let's label the points by whether they are nearest the first center or not
    var labeled = securedata.Select(x => new Example(x, NearestCenter(x, centers) == centers[0] ? 1.0 : -1.0));

    // the Perceptron algorithm repeatedly adds misclassified examples to a normal vector
    // var perceptronnormal = GenerateData(dimensions).First();
    // foreach (var index in Enumerable.Range(0, iterations))
    //     perceptronnormal = PerceptronStep(labeled, perceptronnormal, 0.1);
    //
    // var perceptronerror = labeled.NoisyAverage(0.1, x => x.Label * x.Vector.Select((v, i) => v * perceptronnormal[i]).Sum() < 0.0 ? 1.0 : 0.0);
    // Console.WriteLine("perceptron error rate:\t\t{0:F4}", perceptronerror);
    //
    // // the Support Vector Machine attempts to find a maximum margin classifier
    // var supportvectornormal = GenerateData(dimensions).First();
    // foreach (var index in Enumerable.Range(0, iterations))
    //     supportvectornormal = SupportVectorStep(labeled, supportvectornormal, 0.1);
    //
    // var supportvectorerror = labeled.NoisyAverage(0.1, x => x.Label * x.Vector.Select((v, i) => v * supportvectornormal[i]).Sum() < 0.0 ? 1.0 : 0.0);
    // Console.WriteLine("support vector error rate:\t{0:F4}", supportvectorerror);

    // Logistic regression optimizes the likelihood of the labels under the logistic function
    var logisticnormal = GenerateData(dimensions).First();
    foreach (var index in Enumerable.Range(0, iterations))
    {
        logisticnormal = LogisticStep(labeled, logisticnormal, 0.1);
    }

    var logisticerror = labeled.NoisyAverage(0.1, x => x.Label * x.Vector.Select((v, i) => v * logisticnormal[i]).Sum() < 0.0 ? 1.0 : 0.0);
    Console.WriteLine("logistic error rate:\t\t{0:F4}", logisticerror);

    Console.ReadKey();
}
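// NOTE: the GenerateData helper is not shown in this section; OtherMain uses it as an endless
// stream of double[] vectors, while the graph Main below assumes a variant yielding int[] edges.
// A hypothetical sketch of the vector version, with coordinates drawn uniformly from [-1,+1]:
static IEnumerable<double[]> GenerateData(int dimensions)
{
    var random = new Random();
    while (true)
    {
        yield return Enumerable.Range(0, dimensions)
                               .Select(i => 2.0 * random.NextDouble() - 1.0)
                               .ToArray();
    }
}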
static void Main(string[] args)
{
    var participants = 1000;
    var edges = 10000;

    var sourcegraph = GenerateData(participants).Take(edges).ToArray().AsQueryable();
    var agent = new PINQAgentBudget(10000);
    var securegraph = new PINQueryable<int[]>(sourcegraph, agent);

    // we'll start by computing degree distributions
    var nodes = securegraph.GroupBy(x => x[0]);
    var nodeparts = nodes.Partition(Enumerable.Range(0, 20).ToArray(), x => x.Count());
    foreach (var degree in Enumerable.Range(0, 20))
    {
        Console.WriteLine("degree {0}:\t{1:F2}\t+/- {2:F2}", degree, nodeparts[degree].NoisyCount(0.1), 10.0);
    }
    Console.WriteLine();

    // for a bunch of the analyses, we want the degree to be bounded
    var bound = 10;
    var bounded = BoundDegree(securegraph, bound).Materialize();

    // with a degree-bounded graph, we can measure things like assortativity. each edge is joined using both of its endpoints.
    // this uses the "bounded join", which imposes a limit on the number of records with each key, to bound the transformation's stability.
    var edgedegrees = securegraph.Join(nodes, edge => edge[0], node => node.Key, bound, bound,
                                       (edge, node) => new int[] { node.Count(), edge[1] })
                                 .Join(nodes, edge => edge[1], node => node.Key, bound, bound,
                                       (edge, node) => new int[] { edge[0], node.Count() });

    Console.WriteLine("Assortativity:");
    var srcparts = edgedegrees.Partition(Enumerable.Range(8, 5).ToArray(), edge => edge[0]);
    foreach (var i in Enumerable.Range(8, 5))
    {
        var dstparts = srcparts[i].Partition(Enumerable.Range(8, 5).ToArray(), edge => edge[1]);
        foreach (var j in Enumerable.Range(8, 5))
        {
            Console.Write("\t{0:F2}", dstparts[j].NoisyCount(0.1));
        }
        Console.WriteLine();
    }
    Console.WriteLine();

    // we can also measure the clustering coefficient: the number of triangles divided by the number of length-two paths.
    var paths2 = ExtendPaths(bounded, bounded, bound, bound);
    var paths3 = ExtendPaths(paths2, bounded, bound * bound, bound);

    var triangles = paths3.Where(x => x[0] == x[3]);

    Console.WriteLine("Triangles:\t{0}", triangles.NoisyCount(0.1));
    Console.WriteLine("Len 2 paths:\t{0}", paths2.NoisyCount(0.1));
    Console.WriteLine();

    // one way to view pagerank is as the sum, over all paths arriving at a vertex, of the probability of
    // traversing that path; usually this looks something like (alpha/degree)^length.
    // although we'll have to use increasingly noisy counts for longer paths, to prevent privacy explosion,
    // the contributions of these terms are scaled down commensurately.
    var depth = 3;
    var paths = new PINQueryable<int[]>[depth];
    paths[0] = bounded;
    foreach (var index in Enumerable.Range(1, depth - 1))
    {
        paths[index] = ExtendPaths(paths[index - 1], bounded, Convert.ToInt32(Math.Pow(bound, index)), bound).Materialize();
    }

    // for any set of endpoints (too small a set gives bad results, as privacy would dictate) we compute
    var pagerank = 0.0;
    foreach (var index in Enumerable.Range(0, depth))
    {
        pagerank += paths[index].Where(path => path.Last() % 10 == 0)
                                .NoisyCount(0.1 * Math.Pow(0.85 / bound, index)) * Math.Pow(0.85 / bound, index);

        Console.WriteLine("pagerank using paths of length at most {0}:\t{1}", index + 1, pagerank);
    }

    Console.ReadKey();
}
private static void wrapSequenceTable(string sequenceFile)
{
    IEnumerable<Trace> traces = cc.Read<Trace>(sequenceFile, inputFileDescription);
    TraceQuery = new PINQueryable<Trace>(traces.AsQueryable(), Agent);
}