/// <summary>
/// Used when entering a new directory: registers the directory in the database,
/// explores each of its sub-directories recursively, then adds its files.
/// </summary>
/// \todo Protect the non-first iterations + comments
/// <param name="_realPath">Path of the directory to explore</param>
/// <param name="_type">Type of exploration; addFile is asked to read the file content only for Full_And_File_Contents</param>
/// <param name="_parentID">ID of the parent folder in the database; replaced by the result of createRouteToPath on the first iteration</param>
/// <param name="_firstIteration">True for the initial call, false for the recursive calls made by this method</param>
/// <exception cref="DriveNotFoundException">On the first iteration, when <paramref name="_realPath"/> is not an existing directory</exception>
private void recursiveExploration(string _realPath, FileExplorationType _type = FileExplorationType.Full_And_File_Contents, string _parentID = "0", bool _firstIteration = true)
{
    database = DB.getInstance();

    if (_firstIteration)
    {
        //check if the path is reachable
        if (!Directory.Exists(_realPath))
        {
            throw new DriveNotFoundException();
        }

        //make sure all the folders are set up to this point
        _parentID = createRouteToPath(_realPath);
    }

    //the folder name is the last segment of the path, whichever separator is used
    string[] folderPaths = _realPath.Split(new char[] { '/', '\\' });
    string folderName = folderPaths[folderPaths.Length - 1];

    //check that the folder name is correct. if not, do not do further operations in it
    //to avoid weird things caused by the Directory.GetDirectories()
    if (UNAUTHORISED_CHAR_IN_FILES_REGEX.IsMatch(folderName) || UNAUTHORISED_CHAR_IN_FOLDER_REGEX.IsMatch(folderName))
    {
        Debug.WriteLine("Error : Folder with name not supported : " + _realPath);
        return;
    }

    //Add the folder to the DB, except on the first iteration where
    //createRouteToPath already provided the id to use
    string folderID;
    if (_firstIteration)
    {
        folderID = _parentID;
    }
    else
    {
        folderID = database.addFolder(folderName, _parentID, FileOrigin.Local);
    }

    //List the child directories; a directory that vanished in the meantime has no children to explore
    string[] childPaths;
    try
    {
        childPaths = Directory.GetDirectories(_realPath);
    }
    catch (DirectoryNotFoundException)
    {
        Debug.Print("Folder Error");
        childPaths = new string[0];
    }

    //Recursively call all the child directories.
    //Each child is guarded on its own so that one failing child does not
    //prevent the exploration of its remaining siblings.
    foreach (string childPath in childPaths)
    {
        try
        {
            recursiveExploration(childPath, _type, folderID, false);
        }
        catch (DirectoryNotFoundException)
        {
            Debug.Print("Folder Error");
        }
    }

    //Call all the files to be added, reading their content only when the exploration type asks for it
    bool readFileContent = _type == FileExplorationType.Full_And_File_Contents;
    foreach (string filePath in Directory.GetFiles(_realPath))
    {
        addFile(filePath, _realPath, folderID, readFileContent);
    }
}
/// <summary>
/// Start the indexation of the website applying the mode inputed.
/// The root URL is added as a folder, then every link collected while reading
/// the pages is visited once and added as a file of that folder.
/// Nothing is indexed when no response can be obtained from the URL.
/// </summary>
/// \todo Handle the type + maybe create more folders
/// \todo Handle website not reachable
/// <param name="_url">The url of the website from which get the content </param>
/// <param name="_type">The type of exploration wanted (not used by this method yet, see the todo above)</param>
public void index(string _url, FileExplorationType _type)
{
    //Check if the URL is valid
    //http://stackoverflow.com/a/5926605
    bool urlIsCorrect = false;
    WebRequest webRequest = WebRequest.Create(_url);
    try
    {
        //only reachability matters here: dispose the response straight away
        //so that it does not hold on to its connection
        using (WebResponse webResponse = webRequest.GetResponse())
        {
            urlIsCorrect = true;
        }
    }
    catch
    {
        //any failure to get a response means the page cannot be reached; urlIsCorrect stays false
    }

    //If the page cannot be reached there is nothing to index
    if (!urlIsCorrect)
    {
        //TODO display an error
        return;
    }

    //make sure there's a / at the end of the link
    if (_url.Length > 0 && _url[_url.Length - 1] != '/')
    {
        _url += '/';
    }

    //add the root folder to the DB
    database = DB.getInstance();
    string idParentFolder = database.addFolder(_url, "0", FileOrigin.Web);

    //Start the indexation
    #region Indexation
    //Initialise the web client, released once the indexation is over
    using (WebClient wc = new WebClient())
    {
        //Create an hashset used to explore all the links between the pages, and add the current page to it
        HashSet<string> linksHashSet = new HashSet<string>();
        linksHashSet.Add(_url);
        string domain = DOMAIN_FINDER.Match(_url).Groups[1].Value;

        //Loop until the end of the hashset; readPageContent may add new links while we iterate
        //NOTE(review): visiting by index assumes the set keeps its elements in insertion order — confirm
        for (int i = 0; i < linksHashSet.Count; i++)
        {
            //ElementAt walks the set, so read the current link only once per iteration
            string link = linksHashSet.ElementAt(i);

            //check if this is a readablepage or not
            bool isReadable = READABLE_EXTENSION.IsMatch(link);
            string filTextContent = "";

            if (isReadable)
            {
                //try to read the content of the page; a page that cannot be downloaded is treated as empty
                try
                {
                    filTextContent = wc.DownloadString(link);
                }
                catch
                {
                    filTextContent = "";
                }

                //a readable page without any content is not added to the DB
                if (filTextContent == "")
                {
                    continue;
                }

                //Parse only the text, and fill the hashet with the links found in the page
                filTextContent = readPageContent(domain, filTextContent, ref linksHashSet);
            }

            //get the file name
            string filName = link;

            //check if this links start with a domain or no
            if (filName.Length == 0 || filName[0] != '/')
            {
                //remove the domain and stock the file name
                filName = DOMAIN_FINDER.Replace(filName, "");
            }

            //split the name from the extension at the first dot
            string[] nameParts = filName.Split(new char[] { '.' }, 2);
            filName = nameParts[0];

            if (isReadable)
            {
                //readable pages are stored with their text content and without extension
                database.addFile(filName, "", idParentFolder, filTextContent);
            }
            else
            {
                //else simply add it to the file list; a link without any dot has no extension
                string filExtension = nameParts.Length > 1 ? nameParts[1] : "";
                database.addFile(filName, filExtension, idParentFolder);
            }
        }
    }
    #endregion
}
/// <summary>
/// Start the indexation of a local path by handing it over to recursiveExploration.
/// </summary>
/// <param name="_path">Path to index</param>
/// <param name="_type">Type of exploration, by default explore all the files and their content</param>
public void index(string _path, FileExplorationType _type = FileExplorationType.Full_And_File_Contents)
{
    //the other parameters of recursiveExploration keep their default values
    recursiveExploration(_realPath: _path, _type: _type);
}