// Verifies StringTokenizer.GetDigrams on a two-sentence token stream.
// The test expects eight digrams in total: five from the tokens before the
// first "." and three from the tokens between the first and second ".".
public void TestGetDigrams()
{
    string[] tokens =
    {
        "c'mon", "tell", "me", "all", "your", "secrets", ".",
        "i", "won't", "tell", "anyone", "."
    };

    string[] digrams = StringTokenizer.GetDigrams(tokens);

    Assert.AreEqual(8, digrams.Length);

    // No digram may include the "." token itself.
    Assert.IsTrue(digrams.All(x => !x.Contains(".")));

    // No digram may be formed across the "." between "secrets" and "i".
    Assert.IsTrue(digrams.All(x => x != "secrets i"));

    // The first and last expected digrams must actually be present.
    // (The previous form, Any(x => x != "..."), passed for any array that
    // held at least one other string, so it verified nothing.)
    Assert.IsTrue(digrams.Contains("c'mon tell"));
    Assert.IsTrue(digrams.Contains("tell anyone"));
}
// HTTP-triggered entry point (GET or POST, route "getfrequencies").
// Loads every canonical image analysis, feeds each one's digrams and label
// keys to UpdateCounts together with a running string -> int dictionary, and
// then hands both dictionaries to TokenAllocationTableAdapter.InsertFrequencies.
// Responds with 200 OK and a message stating how many analyses were processed.
public static HttpResponseMessage Run(
    [HttpTrigger(AuthorizationLevel.Function, "get", "post", Route = "getfrequencies")] HttpRequestMessage req,
    TraceWriter log)
{
    Startup.Init();

    // Adapters are constructed and initialised in the same order as before.
    var analysisAdapter = new ImageAnalysisTableAdapter();
    analysisAdapter.Init();

    var allocationAdapter = new TokenAllocationTableAdapter();
    allocationAdapter.Init();

    List<ImageAnalysisEntity> analyses = analysisAdapter.GetAllCanonical();

    var digramCounts = new Dictionary<string, int>();
    var labelCounts = new Dictionary<string, int>();

    int handled = 0;
    foreach (ImageAnalysisEntity row in analyses)
    {
        ImageAnalysis parsed = JsonConvert.DeserializeObject<ImageAnalysis>(row.CanonicalJson);

        string[] digrams = StringTokenizer.GetDigrams(parsed.TokenizedText);
        UpdateCounts(digrams, digramCounts);

        var labels = parsed.Labels.Keys.Distinct();
        UpdateCounts(labels, labelCounts);

        // Emit a progress line after every 100th analysis.
        if (++handled % 100 == 0)
        {
            log.Info($"Processed frequencies for {handled} image analyses");
        }
    }

    log.Info($"Inserting {digramCounts.Count} digrams");
    allocationAdapter.InsertFrequencies(TokenAllocationTableAdapter.PartitionDigram, digramCounts);

    log.Info($"Inserting {labelCounts.Count} labels");
    allocationAdapter.InsertFrequencies(TokenAllocationTableAdapter.PartitionLabel, labelCounts);

    return req.CreateResponse(HttpStatusCode.OK, $"Processed {analyses.Count} analyses");
}