/// <summary>
/// Trains a structure extractor from annotated blobs plus a sample of non-annotated blobs,
/// saves the serialized extractor into the container's programs folder and returns its id.
/// </summary>
/// <param name="blobContainerName">Name of the blob container that holds the training data.</param>
/// <param name="trainDirectory">Directory inside the container whose block blobs supply the non-labeled samples.</param>
/// <param name="annotations">
/// Annotated regions. Each entry names a blob (resolved relative to the container) and the
/// start/end offsets of the region inside that blob's text.
/// </param>
/// <returns>The identifier under which the serialized extractor was stored.</returns>
/// <exception cref="AggregateException">
/// No annotations were supplied, the container does not exist, or the training directory
/// contains no block blobs.
/// </exception>
public async Task<Guid> Train([FromUri] string blobContainerName, [FromUri] string trainDirectory, [FromBody] IEnumerable<Annotation> annotations)
{
    // Check annotations. A missing request body arrives as null and is treated like an empty list,
    // so the caller gets the same explicit error instead of a NullReferenceException.
    var annotationsArr = annotations as Annotation[] ?? annotations?.ToArray() ?? new Annotation[0];
    if (annotationsArr.Length == 0)
    {
        throw new AggregateException("No annotations provided for learning");
    }

    // Check blobs folder
    var storageContainer = this.storageClient.GetContainerReference(blobContainerName);
    if (!await storageContainer.ExistsAsync())
    {
        throw new AggregateException($"{blobContainerName} does not exists");
    }

    // Only block blobs can be downloaded as text; the listing may also contain
    // sub-directories or other blob kinds, which are skipped here.
    var dir = storageContainer.GetDirectoryReference(trainDirectory);
    var blobs = dir.ListBlobs().OfType<CloudBlockBlob>().ToArray();
    if (blobs.Length == 0)
    {
        throw new AggregateException($"No blobs found in blobs container: {blobContainerName} folder: {trainDirectory}");
    }

    // Read annotations: download every annotated blob and pair its text with the annotated offsets
    var annotatedSamplesContentFetchingTasks = annotationsArr.Select(async annotation =>
    {
        var blobRef = storageContainer.GetBlobReference(annotation.FileName);
        using (var stream = new MemoryStream())
        {
            await blobRef.DownloadToStreamAsync(stream);
            stream.Position = 0;
            using (var sr = new StreamReader(stream))
            {
                var content = await sr.ReadToEndAsync();
                return new Tuple<string, uint, uint>(content, (uint)annotation.StartOffset, (uint)annotation.EndOffset);
            }
        }
    });

    // Pick up to NonLabeledSampleSize blobs from the training directory that are not annotated
    var annotatedSampleFileNames = annotationsArr.Select(a => a.FileName).ToHashSet();
    var nonLabeledSampleBlobs = blobs
        .Where(b => !annotatedSampleFileNames.Contains(b.Name))
        .Take(NonLabeledSampleSize);
    var nonLabeledSamplesContentFetchingTasks = nonLabeledSampleBlobs.Select(b => b.DownloadTextAsync());

    var examples = await Task.WhenAll(annotatedSamplesContentFetchingTasks);
    var nonLabeledSamples = await Task.WhenAll(nonLabeledSamplesContentFetchingTasks);

    // Train
    var extractor = await StructureExtractor.TrainExtractorAsync(examples, nonLabeledSamples);

    // Save program
    var serializedExtractor = extractor.Serialize();
    var extractorGuid = Guid.NewGuid();
    var blockBlob = storageContainer.GetBlockBlobReference($"{programsFolder}/{extractorGuid:D}");
    await blockBlob.UploadTextAsync(serializedExtractor);

    return extractorGuid;
}
/// <summary>
/// Scrapes sale announcements from the listing at <c>UrlForScrapping</c> and saves each one
/// through <c>UnitOfWork</c>, until <c>ItemsCountForStep</c> announcements have been saved
/// in this run or scraping stops making progress.
/// </summary>
/// <remarks>
/// Failures are logged rather than propagated. A detailed offer whose processing raises
/// <see cref="WebException"/> is retried a limited number of times with a short delay, and the
/// run ends after several consecutive passes that saved nothing, so a permanently broken
/// link or an empty listing can no longer keep the method looping forever.
/// </remarks>
public override async Task ProcessDataAsync()
{
    const int maxOfferAttempts = 3;   // tries per offer when a WebException is raised
    const int maxStalledPasses = 3;   // consecutive passes without a saved announce before giving up

    var scrapedSaleAnnounces = new List<TransportSaleAnnounce>();
    var stalledPasses = 0;

    while (ItemsCountForStep > scrapedSaleAnnounces.Count)
    {
        var scrapedBeforePass = scrapedSaleAnnounces.Count;

        try
        {
            var urlForScrapping = UrlForScrapping + $"&start={ScrappingPage}";
            var htmlDocument = await DataLoader.LoadHtmlDocumentAsync(urlForScrapping);
            var previewOffers = StructureExtractor.GetPreviewOfferStructure(htmlDocument, ScrappingPage).ToList();

            for (int i = 0; i < previewOffers.Count; i++)
            {
                for (var attempt = 1; ; attempt++)
                {
                    try
                    {
                        var sourceLink = SourceLinkParser.GetLink(previewOffers[i], BaseUrl);
                        var detailedHtmlDocument = await DataLoader.LoadHtmlDocumentAsync(sourceLink);
                        var detailedOfferNode = StructureExtractor.GetDetailedOfferStructure(detailedHtmlDocument);
                        var (brandId, modelId) = BrandModelParser.ParseForDetailed(detailedOfferNode);

                        var saleAnnounce = new TransportSaleAnnounce()
                        {
                            SourceLink = sourceLink,
                            TransmissionTypeId = TransmissionTypeParser.ParseForDetailed(detailedOfferNode),
                            AdNumber = OfferNumberParser.ParseForDetailed(detailedOfferNode),
                            BodyTypeId = BodyTypeParser.ParseForDetailed(detailedOfferNode),
                            CityId = RegionParser.ParseForDetailed(detailedOfferNode).Id,
                            Description = DescriptionParser.ParseForDetailed(detailedOfferNode),
                            DriveUnitId = DriveUnitParser.ParseForDetailed(detailedOfferNode),
                            EngineVolumetric = EngineVolumetricParser.ParseForDetailed(detailedOfferNode),
                            FuelTypeId = FuelTypeParser.ParseForDetailed(detailedOfferNode),
                            PreviewImageLink = ImageLinkParser.ParseForPreview(previewOffers[i]),
                            Mileage = MileageParser.ParseForDetailed(detailedOfferNode),
                            BrandId = brandId,
                            VehicleTypeId = VehicleTypeParser.ParseForDetailed(detailedOfferNode),
                            ModelId = modelId,
                            PriceInDollars = PriceParser.ParseForDetailed(detailedOfferNode),
                            UpdateOfferTime = PublishDateParser.ParseForDetailed(detailedOfferNode),
                            Year = YearParser.ParseForDetailed(detailedOfferNode),
                            CreatedAt = DateTime.Now,
                            SourceProviderId = (int)SourceProviderEnum.RST
                        };

                        var carConditionIds = CarConditionParser.ParseForPreview(previewOffers[i]);
                        foreach (var carConditionId in carConditionIds)
                        {
                            saleAnnounce.TransportConditions.Add(new TransportConditionInSaleAnnounce()
                            {
                                TransportConditionId = carConditionId
                            });
                        }

                        scrapedSaleAnnounces.Add(saleAnnounce);
                        UnitOfWork.TransportSaleAnnouncesRepository.Add(saleAnnounce);
                        await UnitOfWork.SaveChangesAsync();

                        // NOTE(review): the counter advances once per saved offer, not once per
                        // listing page — confirm this matches the meaning of the `start` parameter.
                        ScrappingPage++;
                        break;
                    }
                    catch (WebException) when (attempt < maxOfferAttempts)
                    {
                        // Transient network failure: back off briefly, then retry the same offer.
                        // On the last attempt the filter is false, so the exception reaches the
                        // outer handler and is logged there.
                        await Task.Delay(TimeSpan.FromSeconds(attempt));
                    }
                }
            }
        }
        catch (Exception ex)
        {
            _logger.LogError($"{ex.Message}\n{ex.StackTrace}");
        }

        if (scrapedSaleAnnounces.Count > scrapedBeforePass)
        {
            stalledPasses = 0;
        }
        else if (++stalledPasses >= maxStalledPasses)
        {
            // Nothing was saved for several passes in a row (empty listing or a repeating
            // failure); requesting the same URL again would only spin.
            _logger.LogError($"Scraping stopped after {maxStalledPasses} consecutive passes without a new sale announce (start={ScrappingPage})");
            break;
        }
    }
}
/// <summary>
/// Runs a previously trained extractor over every block blob in a directory and returns
/// one annotation per blob.
/// </summary>
/// <param name="blobContainerName">Name of the blob container that holds the data and the saved programs.</param>
/// <param name="scoreDirectory">Directory inside the container whose block blobs are processed.</param>
/// <param name="programId">Identifier of the serialized extractor stored in the programs folder.</param>
/// <returns>
/// An annotation for each processed blob. When the extractor yields no region, or an empty one,
/// the annotation carries only the file name; otherwise it also carries the region's start and end offsets.
/// </returns>
/// <exception cref="AggregateException">
/// The container does not exist, the directory contains no block blobs, or no program is stored
/// under <paramref name="programId"/>.
/// </exception>
public async Task<IEnumerable<Annotation>> Extract([FromUri] string blobContainerName, [FromUri] string scoreDirectory, [FromUri] Guid programId)
{
    // Check blobs folder
    var storageContainer = this.storageClient.GetContainerReference(blobContainerName);
    if (!await storageContainer.ExistsAsync())
    {
        throw new AggregateException($"{blobContainerName} does not exists");
    }

    // Sub-directories and non-block blobs in the listing are ignored
    var dir = storageContainer.GetDirectoryReference(scoreDirectory);
    var blobFiles = dir.ListBlobs().OfType<CloudBlockBlob>().ToArray();
    if (blobFiles.Length == 0)
    {
        throw new AggregateException($"No blobs found in blobs folder {scoreDirectory}");
    }

    var programBlobRef = storageContainer.GetBlobReference($"{programsFolder}/{programId:D}");
    if (!await programBlobRef.ExistsAsync())
    {
        throw new AggregateException($"Program with id {programId:D} not found");
    }

    // Load program
    string serializedProgram;
    using (var stream = new MemoryStream())
    {
        await programBlobRef.DownloadToStreamAsync(stream);
        stream.Position = 0;
        using (var sr = new StreamReader(stream))
        {
            serializedProgram = await sr.ReadToEndAsync();
        }
    }

    var extractor = StructureExtractor.Deserialize(serializedProgram);

    // Extraction: one download-and-extract task per blob
    var extractionTasks = blobFiles.Select(async blob =>
    {
        var content = await blob.DownloadTextAsync();
        var stringRegion = extractor.Extract(content);

        if (stringRegion == null || string.IsNullOrEmpty(stringRegion.Value))
        {
            // Nothing extracted: report the file without offsets
            return new Annotation { FileName = blob.Name };
        }

        return new Annotation
        {
            FileName = blob.Name,
            StartOffset = (int)stringRegion.Start,
            EndOffset = (int)stringRegion.End
        };
    });

    return await Task.WhenAll(extractionTasks);
}