Пример #1
0
        // Checks StringTokenizer.GetDigrams against a token list containing two "."
        // tokens: the result must hold 8 digrams, none may contain ".", none may
        // bridge the two sides of a "." ("secrets i"), and the expected first and
        // last digrams must be present.
        public void TestGetDigrams()
        {
            string[] tokens =
            {
                "c'mon", "tell", "me", "all", "your", "secrets", ".",
                "i", "won't", "tell", "anyone", "."
            };

            string[] digrams = StringTokenizer.GetDigrams(tokens);

            Assert.AreEqual(8, digrams.Length);
            Assert.IsTrue(digrams.All(x => !x.Contains(".")));
            Assert.IsTrue(digrams.All(x => x != "secrets i"));

            // Presence checks: these must compare with ==. Using != inside Any() is
            // satisfied by any other element and therefore never fails in practice.
            // NOTE(review): assumes digrams are the two tokens joined by a single
            // space, as the "secrets i" check above implies - confirm against GetDigrams.
            Assert.IsTrue(digrams.Any(x => x == "c'mon tell"));
            Assert.IsTrue(digrams.Any(x => x == "tell anyone"));
        }
Пример #2
0
        // HTTP-triggered function (GET/POST, route "getfrequencies").
        // Loads every canonical image analysis, tallies digram and label
        // occurrences via UpdateCounts, writes both tallies through the token
        // allocation table adapter, and replies 200 OK with the number of analyses.
        public static HttpResponseMessage Run([HttpTrigger(AuthorizationLevel.Function, "get", "post", Route = "getfrequencies")]
                                              HttpRequestMessage req, TraceWriter log)
        {
            Startup.Init();

            ImageAnalysisTableAdapter analysisTable = new ImageAnalysisTableAdapter();
            analysisTable.Init();

            TokenAllocationTableAdapter allocationTable = new TokenAllocationTableAdapter();
            allocationTable.Init();

            List<ImageAnalysisEntity> analyses = analysisTable.GetAllCanonical();

            Dictionary<string, int> digramCounts = new Dictionary<string, int>();
            Dictionary<string, int> labelCounts = new Dictionary<string, int>();

            for (int index = 0; index < analyses.Count; index++)
            {
                ImageAnalysis analysis = JsonConvert.DeserializeObject<ImageAnalysis>(analyses[index].CanonicalJson);

                UpdateCounts(StringTokenizer.GetDigrams(analysis.TokenizedText), digramCounts);
                UpdateCounts(analysis.Labels.Keys.Distinct(), labelCounts);

                // Emit a progress line after every 100th analysis.
                int processed = index + 1;
                if (processed % 100 == 0)
                {
                    log.Info($"Processed frequencies for {processed} image analyses");
                }
            }

            // Digram frequencies are written first, then label frequencies.
            log.Info($"Inserting {digramCounts.Count} digrams");
            allocationTable.InsertFrequencies(TokenAllocationTableAdapter.PartitionDigram, digramCounts);

            log.Info($"Inserting {labelCounts.Count} labels");
            allocationTable.InsertFrequencies(TokenAllocationTableAdapter.PartitionLabel, labelCounts);

            return req.CreateResponse(HttpStatusCode.OK, $"Processed {analyses.Count} analyses");
        }