static async Task Main(string[] args)
{
    // Load appsettings.json
    var config = LoadAppSettings();
    if (config == null)
    {
        Console.WriteLine("Missing or invalid appsettings.json file. Please see README.md for configuration instructions.");
        return;
    }
    SetGlobalConfig(config);
    searchServiceHelper = new SearchServiceHelper(SearchServiceName, SearchServiceAdminKey);

    // Start stopwatch for timing telemetry
    TimeSpan elapsedTime;
    Stopwatch sw = new Stopwatch();
    var timeStart = DateTime.Now;
    sw.Start();

    // Storage
    var storageAccount = CloudStorageAccount.Parse(StorageConnectionString);
    var storageClient = storageAccount.CreateCloudBlobClient();
    AzureTableStorage azTableStorage = new AzureTableStorage(StorageConnectionString, StorageTableName);
    AzureTableStorage azTableStorageSpoItems = new AzureTableStorage(StorageConnectionString, SpoItemStorageTableName);
    CloudBlobContainer container = await AzureBLOBStorage.CreateAzureBLOBContainer(storageClient, BlobContainerName);

    // Search
    AzureSearchServiceHelper searchClient = new AzureSearchServiceHelper(SearchServiceName, SearchServiceAdminKey);

    IDriveItemChildrenCollectionPage docLibItems;
    IDriveItemDeltaCollectionPage docLibDeltaItems;

    // Parse command-line switches
    for (int i = 0; i < args.Length; i++)
    {
        if (args[i].ToLower() == "-incrementalcrawl")
        {
            IncrementalCrawl = true;
            Console.WriteLine("Search Crawl mode set to Incremental");
        }
        if (args[i].ToLower() == "-fullcrawl")
        {
            IncrementalCrawl = false;
            Console.WriteLine("Search Crawl mode set to Full");
            // A full crawl rebuilds the BLOB store from scratch, so drop and re-create the container.
            await AzureBLOBStorage.DeleteContainerFromAzureBLOB(container);
            container = await AzureBLOBStorage.CreateAzureBLOBContainer(storageClient, BlobContainerName);
        }
        if (args[i].ToLower() == "-includeacls")
        {
            IncludeAcls = true;
            Console.WriteLine("ACL extraction enabled");
        }
    }

    SharePointOnlineHelper.metadataFieldsToIgnore = MetadataFieldsToIgnore;
    SharePointOnlineHelper.metadataJSONStore = MetadataJSONStore;
    SharePointOnlineHelper.acls = IncludeAcls;
    SharePointOnlineHelper.azTableStorage = azTableStorageSpoItems;
    foreach (var metadataFieldToIgnore in MetadataFieldsToIgnore)
    {
        Console.WriteLine("Removing key [{0}] from metadata fields to extract", metadataFieldToIgnore);
    }

    // Query using the Graph SDK (preferred when possible)
    GraphServiceClient graphClient = SharePointOnlineHelper.GetAuthenticatedGraphClient(config);
    Site targetSite = await graphClient.Sites.GetByPath(SiteUrl, SPOHostName).Request().GetAsync();
    ISiteDrivesCollectionPage drives = await graphClient.Sites[targetSite.Id].Drives.Request().GetAsync();

    // Graph BETA supports site pages:
    //var sitePages = await graphClient.Sites[targetSite.Id].Pages.Request().GetAsync();

    foreach (var drive in drives)
    {
        var driveName = drive.Name;
        var driveUrl = drive.WebUrl;
        bool excludedDocLib = Array.Exists(DocLibsToIgnore, element => element == driveName);
        if (excludedDocLib)
        {
            Console.WriteLine("Skipping [{0}] as it's an excluded docLib", driveName);
            continue;
        }
        Console.WriteLine("Fetching items from drive [{0}]", driveName);
        var driveId = drive.Id;
        var driveContents = new List<DriveItem>();

        // Full crawl: page through every item in the document library root,
        // then process the complete set in a single pass.
        if (!IncrementalCrawl)
        {
            docLibItems = await graphClient
                .Drives[driveId]
                .Root
                .Children
                .Request()
                .GetAsync();
            driveContents.AddRange(docLibItems.CurrentPage);
            while (docLibItems.NextPageRequest != null)
            {
                docLibItems = await docLibItems.NextPageRequest.GetAsync();
                driveContents.AddRange(docLibItems.CurrentPage);
            }
            await SharePointOnlineHelper.GetSpoDocumentItems(graphClient, driveContents, driveId, container, IncludeAcls);
        }

        // Incremental crawl: replay only the changes since the last stored deltaToken.
        if (IncrementalCrawl)
        {
            // Retrieve the last known deltaToken from Table storage; if the value is null,
            // the delta query falls back to fetching all items for that drive.
            // Base64-encode the drive URL to remove characters that are invalid in a partition key.
            byte[] byt = System.Text.Encoding.UTF8.GetBytes(driveUrl);
            var driveUrlEscaped = Convert.ToBase64String(byt);
            var lastDeltaToken = await azTableStorage.GetEntitiesInPartion(driveUrlEscaped);

            docLibDeltaItems = await graphClient
                .Drives[driveId]
                .Root
                .Delta(lastDeltaToken)
                .Request()
                .GetAsync();

            driveContents.AddRange(docLibDeltaItems.CurrentPage);
            while (docLibDeltaItems.NextPageRequest != null)
            {
                docLibDeltaItems = await docLibDeltaItems.NextPageRequest.GetAsync();
                driveContents.AddRange(docLibDeltaItems.CurrentPage);
            }
            await SharePointOnlineHelper.GetSpoDocumentItems(graphClient, driveContents, driveId, container, IncludeAcls);

            // The @odata.deltaLink is only present on the final page of the delta feed.
            docLibDeltaItems.AdditionalData.TryGetValue("@odata.deltaLink", out object deltaLinkValue);
            var deltaLink = deltaLinkValue?.ToString();
            if (deltaLink != null)
            {
                // Extract the bare token; the offset arithmetic assumes the value
                // is wrapped in single quotes, i.e. token='...'.
                var tokenIndex = deltaLink.IndexOf("token=");
                var token = deltaLink.Substring(tokenIndex + 7, deltaLink.Length - tokenIndex - 9);

                // Persist the deltaToken to storage so the next incremental crawl can continue from this point.
                IndexCrawlEntity indexCrawlEntity = new IndexCrawlEntity(driveUrlEscaped, token);
                azTableStorage.InsertEntity(indexCrawlEntity);
            }
        }
    }

    if (!IncrementalCrawl)
    {
        // All fetched SPO documents are now in the BLOB store, so run a full crawl over it.
        // Warning: this performs an entire search index rebuild, so search results will be
        // impacted while this phase is running.
        await IndexDocumentsAsync();
    }

    sw.Stop();
    elapsedTime = sw.Elapsed;
    var timeEnd = DateTime.Now;
    Console.WriteLine("Fetched total of {0} documents during crawl", AzureBLOBStorage.DownloadFileCount);
    Console.WriteLine("Crawl Start time: {0}", timeStart);
    Console.WriteLine("Crawl Completed time: {0}", timeEnd);
    Console.WriteLine("Total crawl duration time: {0}", elapsedTime);
}
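// A minimal sketch of a more defensive way to pull the deltaToken out of the
// @odata.deltaLink than the offset arithmetic used in Main above.
// ExtractDeltaToken is a hypothetical helper (not part of this project); it
// assumes the deltaLink is a well-formed URI whose query string carries the
// token, which Graph may wrap in single quotes (token='...').
private static string ExtractDeltaToken(string deltaLink)
{
    // ParseQueryString tolerates the leading '?' returned by Uri.Query.
    var query = new Uri(deltaLink).Query;
    var token = System.Web.HttpUtility.ParseQueryString(query)["token"];
    // Strip the single quotes around SharePoint-style delta tokens, if present.
    return token?.Trim('\'');
}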
public static async Task GetSpoDocumentItems(GraphServiceClient graphClient, List<DriveItem> docLibItems, string driveId, CloudBlobContainer container, bool getAcls)
{
    foreach (var item in docLibItems)
    {
        // Folders are walked recursively so that nested files are picked up too.
        if (item.Folder != null)
        {
            string parentFolderPath = null;
            string fullFolderNamePath;
            var folderName = item.Name;
            if (item.ParentReference.Path != null)
            {
                // ParentReference.Path looks like "/drives/{id}/root:/Folder/SubFolder";
                // everything after the colon is the folder path within the drive.
                var parentFolderPathSplit = item.ParentReference.Path.Split(":");
                if (parentFolderPathSplit.Length >= 2)
                {
                    parentFolderPath = parentFolderPathSplit[1];
                }
                if (!string.IsNullOrEmpty(parentFolderPath))
                {
                    fullFolderNamePath = String.Format("{0}/{1}", parentFolderPath, folderName);
                }
                else
                {
                    fullFolderNamePath = folderName;
                }
            }
            else
            {
                fullFolderNamePath = folderName;
            }
            var folderItems = await GetFolderContents(graphClient, fullFolderNamePath, driveId);
            if (folderItems.Count > 0)
            {
                await GetSpoDocumentItems(graphClient, folderItems, driveId, container, getAcls);
            }
        }

        // Files are downloaded into BLOB storage together with their metadata.
        if (item.File != null)
        {
            // Deleted items (surfaced by delta queries) are removed from the BLOB store
            // so they are not indexed again; then move on to the next item.
            if (item.Deleted != null)
            {
                if (item.Deleted.State == "deleted")
                {
                    Console.WriteLine("Deleted Item detected");
                    var spoItemUrl = await azTableStorage.GetSpoItemEntitiesInPartion(item.Id);
                    // Clean up the Storage account path for the deleted item.
                    await AzureBLOBStorage.DeleteFileFromAzureBLOB(spoItemUrl, container);
                    // Clean up the JSON metadata file for the above file.
                    var spoItemUrlJson = $"{spoItemUrl}.json";
                    await AzureBLOBStorage.DeleteFileFromAzureBLOB(spoItemUrlJson, container);
                    continue;
                }
            }

            // Use the file metadata to name the downloaded file and to get the download URL.
            var driveItemInfo = await graphClient.Drives[driveId].Items[item.Id].Request().GetAsync();
            var spWebUrl = driveItemInfo.WebUrl;
            var createdAuthorDisplayName = driveItemInfo.CreatedBy.User.DisplayName;
            var baseFileName = spWebUrl;
            var jsonMetadataFileName = $"{baseFileName}.json";

            // ACL extraction for security trimming - still work in progress.
            if (getAcls)
            {
                var driveItemPermissions = await graphClient.Drives[driveId].Items[item.Id].Permissions.Request().GetAsync();
                foreach (var driveItemPermission in driveItemPermissions)
                {
                    var grantedDisplayName = driveItemPermission.GrantedTo.User.DisplayName;
                    var grantedObjectId = driveItemPermission.GrantedTo.User.Id;
                    // If no object ID is present, the grantee is a SharePoint group, so resolve
                    // its membership via the SharePoint REST API and look for a matching AAD group.
                    if (grantedObjectId == null)
                    {
                        var scopes = new[] { _spoHostName + "/.default" };
                        var clientApplication = ConfidentialClientApplicationBuilder.Create(_clientId)
                            .WithAuthority(_authority)
                            .WithClientSecret(_clientSecret)
                            .WithTenantId(_tenantId)
                            .Build();
                        var result = await clientApplication.AcquireTokenForClient(scopes).ExecuteAsync();

                        HttpClient client = new HttpClient();
                        client.DefaultRequestHeaders.Add("Authorization", "Bearer " + result.AccessToken);
                        client.DefaultRequestHeaders.Add("Accept", "application/json");

                        // Expand the SharePoint group membership via the SharePoint REST API.
                        HttpResponseMessage groupUsersResponse = await client.GetAsync(
                            String.Format("{0}/_api/Web/SiteGroups/GetByName('{1}')/users", spWebUrl, grantedDisplayName));

                        // Try to resolve a matching AAD group by display name.
                        // Note: the results are not yet wired into the index.
                        var groupLookup = await graphClient.Groups
                            .Request()
                            .Filter($"startswith(displayName, '{grantedDisplayName}')")
                            .Select("id, displayName")
                            .GetAsync();
                    }
                }
            }

            // Generate metadata content and upload it to BLOB storage.
            var fields = await graphClient.Drives[driveId].Items[item.Id].ListItem.Fields.Request().GetAsync();
            var metadataFields = fields.AdditionalData;
            foreach (var metadataFieldToIgnore in metadataFieldsToIgnore)
            {
                // Remove is a no-op (returns false) when the key is absent, so no guard is needed.
                metadataFields.Remove(metadataFieldToIgnore);
            }
            metadataFields.Add("SPWebUrl", spWebUrl);
            metadataFields.Add("createdAuthorDisplayName", createdAuthorDisplayName);

            // Get the download URL. This URL is preauthenticated and has a short TTL.
            driveItemInfo.AdditionalData.TryGetValue("@microsoft.graph.downloadUrl", out object downloadUrl);
            Console.WriteLine("located file {0}, full url [{1}]", baseFileName, downloadUrl.ToString());

            if (metadataJSONStore.Equals("True"))
            {
                // External JSON file approach: write the metadata to a sidecar .json BLOB.
                using (var metadataJson = GenerateJsonMetadataFile(metadataFields))
                {
                    var uploadUri = await AzureBLOBStorage.UploadFileToAzureBLOB(metadataJson, jsonMetadataFileName, container);
                    await AzureBLOBStorage.DownloadFileToAzureBLOB(graphClient, downloadUrl, baseFileName, container, uploadUri);
                }
            }
            else
            {
                // BLOB metadata approach: attach the metadata directly to the document BLOB.
                await AzureBLOBStorage.DownloadFileToAzureBLOB(graphClient, downloadUrl, baseFileName, container, metadataFields);
            }

            // Persist the itemId and url to the Storage Table so deletions can be mapped back later.
            SpoItem spoItemEntity = new SpoItem(item.Id, spWebUrl);
            azTableStorage.InsertSpoItemEntity(spoItemEntity);
        }
    }
}
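// GetFolderContents is referenced above but not shown in this section. Below is
// a minimal sketch, assuming it simply pages through the children of a folder
// path within the drive; the project's real implementation may differ.
public static async Task<List<DriveItem>> GetFolderContents(GraphServiceClient graphClient, string folderPath, string driveId)
{
    var folderContents = new List<DriveItem>();
    // Address the folder by its path relative to the drive root.
    var page = await graphClient
        .Drives[driveId]
        .Root
        .ItemWithPath(folderPath)
        .Children
        .Request()
        .GetAsync();
    folderContents.AddRange(page.CurrentPage);
    // Keep following NextPageRequest until Graph reports no further pages.
    while (page.NextPageRequest != null)
    {
        page = await page.NextPageRequest.GetAsync();
        folderContents.AddRange(page.CurrentPage);
    }
    return folderContents;
}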