/// <summary>
/// Returns the shared <see cref="TXTCreating"/> instance, creating it on the first call.
/// When <paramref name="URL"/> is accepted by <c>VerifyURL</c>, a directory is created whose
/// name is the second ':'-separated piece of the URL without its first two characters.
/// <c>codeHTML</c> and <c>nbrTooLong</c> are reset on every call.
/// </summary>
/// <param name="URL">Address used to derive the directory name.</param>
/// <returns>The shared <see cref="TXTCreating"/> instance.</returns>
public static TXTCreating GetTXT(string URL)
{
    // Build the shared instance lazily.
    if (creator == null)
    {
        creator = new TXTCreating();
    }

    bool urlAccepted = creator.VerifyURL(URL);
    if (urlAccepted)
    {
        // For "scheme://rest" the piece at index 1 is "//rest"; dropping the
        // two leading characters leaves "rest" as the directory name.
        TempDirectorySplit = URL.Split(':');
        string afterScheme = TempDirectorySplit[1];
        directoryName = afterScheme.Substring(2);
        Directory.CreateDirectory(directoryName);
    }

    // Start every request from a clean state.
    codeHTML = "";
    nbrTooLong = 0;

    return creator;
}
/// <summary>
/// Fetches the HTML of <paramref name="URL"/> through <paramref name="Creator"/> and collects
/// the lower-cased words found between tags. Text is ignored while the <c>scripted</c> flag,
/// which is driven by <c>IsScript</c>, is set.
/// </summary>
/// <param name="Creator">TXTCreating object whose <c>TakeHTML</c> supplies the HTML to analyse.</param>
/// <param name="URL">URL of the HTML to analyse.</param>
/// <returns>
/// The words in document order, or <c>null</c> when <c>TakeHTML</c> returned <c>null</c>
/// (callers rely on <c>null</c> to detect an unavailable page).
/// </returns>
public List<string> WordSearching(TXTCreating Creator, string URL)
{
    string HTML = Creator.TakeHTML(URL);
    if (HTML == null)
    {
        return null;
    }

    // Line breaks and tabs separate words exactly like spaces do.
    HTML = HTML.Replace('\n', ' ').Replace('\r', ' ').Replace('\t', ' ');

    List<string> lstWords = new List<string>();
    bool takeChr = false;   // true once a '>' has been seen, i.e. we are in text content
    bool scripted = false;  // last answer of IsScript; text is skipped while it is true
    string word = "";

    for (int x = 0; x < HTML.Length; ++x)
    {
        char currentChr = HTML[x];

        // Whenever the six characters ending here spell "script", IsScript is asked to
        // decide based on the character just before them (a space when there is none).
        // NOTE(review): presumably '<' opens and '/' closes a script section - confirm in IsScript.
        if (x >= 5 && string.CompareOrdinal(HTML, x - 5, "script", 0, 6) == 0)
        {
            char beforeKeyword = x >= 6 ? HTML[x - 6] : ' ';
            scripted = IsScript(beforeKeyword);
        }

        if (currentChr == '>')
        {
            takeChr = true;
        }
        else if (currentChr == '<' && takeChr)
        {
            takeChr = false;
            // A tag ends the current word; otherwise "<b>foo</b>bar" would yield "foobar"
            // and a word directly followed by a tag could be lost entirely.
            AddPendingWord(lstWords, ref word);
        }
        else if (takeChr && !scripted)
        {
            if (currentChr == ' ')
            {
                AddPendingWord(lstWords, ref word);
            }
            else
            {
                word += currentChr;
            }
        }
    }

    // Text that runs up to the very end of the document still forms a word.
    AddPendingWord(lstWords, ref word);

    return lstWords;
}

/// <summary>
/// Appends the lower-cased pending word to <paramref name="words"/> when it is not empty,
/// then clears it.
/// </summary>
private static void AddPendingWord(List<string> words, ref string word)
{
    if (word != "")
    {
        words.Add(word.ToLower());
        word = "";
    }
}
/// <summary>
/// Fetches the HTML of <paramref name="URL"/> through <paramref name="Creator"/> and collects
/// the quoted <c>href</c> values found between a tag that starts with "&lt;a" and the next
/// "&lt;/a&gt;".
/// </summary>
/// <param name="Creator">TXTCreating object whose <c>TakeHTML</c> supplies the HTML to analyse.</param>
/// <param name="URL">URL of the HTML to analyse.</param>
/// <returns>
/// The non-empty href values in document order, or <c>null</c> when <c>TakeHTML</c> returned
/// <c>null</c> (callers rely on <c>null</c> to detect an unavailable page).
/// </returns>
public List<string> Finder(TXTCreating Creator, string URL)
{
    string HTML = Creator.TakeHTML(URL);
    if (HTML == null)
    {
        return null;
    }

    List<string> lstLinks = new List<string>();
    string Link = "";
    bool isLink = false;      // between "<a" and the next "</a>"
    bool takeCar = false;     // an href value is being read
    int chrRemaining = 0;     // characters still to skip after "href": '=' and the opening quote
    char closingQuote = '"';  // delimiter that ends the value being read

    for (int x = 0; x < HTML.Length; ++x)
    {
        char current = HTML[x];

        if (current == 'a' && x >= 1 && HTML[x - 1] == '<')
        {
            isLink = true;
        }
        else if (x >= 3 && string.CompareOrdinal(HTML, x - 3, "</a>", 0, 4) == 0)
        {
            isLink = false;
            // Drop a value that was never terminated so it cannot leak into the next anchor.
            takeCar = false;
            Link = "";
        }

        if (!isLink)
        {
            continue;
        }

        // Only look for the attribute name outside a value; otherwise a URL that itself
        // contains "href" would restart the capture and be corrupted.
        if (!takeCar && x >= 3 && string.CompareOrdinal(HTML, x - 3, "href", 0, 4) == 0)
        {
            takeCar = true;
            chrRemaining = 2;
            closingQuote = '"';
        }
        else if (takeCar)
        {
            if (chrRemaining > 0)
            {
                chrRemaining--;
                // The second skipped character is the opening quote; accept '...' as well as "...".
                if (chrRemaining == 0 && current == '\'')
                {
                    closingQuote = '\'';
                }
            }
            else if (current == closingQuote)
            {
                takeCar = false;
                // href="" carries no target, so it is not reported as a link.
                if (Link != "")
                {
                    lstLinks.Add(Link);
                }
                Link = "";
            }
            else
            {
                Link += current;
            }
        }
    }

    return lstLinks;
}
/// <summary>
/// Drives the whole analysis of a website: collects the links of the start page and of the
/// pages it links to, extracts the words of every resolvable page, and writes one text
/// output per analysed page through <c>CreateTXT</c>. Progress is printed to the console.
/// Nothing is done when <c>VerifyURL</c> rejects <paramref name="URL"/>.
/// </summary>
/// <param name="URL">URL of the website to analyse.</param>
public void Start(string URL)
{
    List<List<string>> TempWordLst = new List<List<string>>();
    List<string> lstLinksName = new List<string>();

    TXTCreating Creator = TXTCreating.GetTXT(URL);
    HTMLTxtAnalyser Analyser = HTMLTxtAnalyser.GetHTMLTxt();

    //Verify the link of the website
    if (!Creator.VerifyURL(URL))
    {
        return;
    }

    //Search of links on the website
    HTMLLinkFinder linkFinder = HTMLLinkFinder.GetHTMLLink();

    // Finder returns null when the page could not be read; treat that as "no links".
    List<string> lstLinks = linkFinder.Finder(Creator, URL) ?? new List<string>();

    // Follow the links of the start page one level deep. The loop walks a snapshot so the
    // growing result list is never the collection being enumerated.
    List<string> startPageLinks = new List<string>(lstLinks);
    foreach (string link in startPageLinks)
    {
        string target;
        string name;
        if (!TryResolveLink(URL, link, out target, out name))
        {
            continue;
        }

        List<string> found = linkFinder.Finder(Creator, target);
        if (found != null)
        {
            lstLinks = lstLinks.Union(found).ToList();
        }
    }

    if (!lstLinks.Contains(URL))
    {
        lstLinks.Add(URL);
    }

    //Search of words
    int Analysed = 0;
    foreach (string link in lstLinks)
    {
        string target;
        string name;
        if (TryResolveLink(URL, link, out target, out name))
        {
            List<string> words = Analyser.WordSearching(Creator, target);
            if (words != null)
            {
                TempWordLst.Add(words);
                lstLinksName.Add(name);
            }
        }

        Analysed++;
        Console.SetCursorPosition(0, 5);
        Console.WriteLine("Analysed : " + Analysed * 100 / lstLinks.Count + "%");
    }

    //Text creation
    int Created = 0;
    for (int x = 0; x < lstLinksName.Count; ++x)
    {
        Creator.CreateTXT(lstLinksName[x], TempWordLst[x]);
        Created++;
        Console.SetCursorPosition(0, 6);
        Console.WriteLine("Created : " + Created * 100 / lstLinksName.Count + "%");
    }

    Console.WriteLine("\n\nWebsite analysed!!!");
}

/// <summary>
/// Turns a raw href value into the address to download and the name under which its words
/// are stored. Handles root-relative links ("/page"), links that contain the site URL, and
/// relative links ending in ".html"; every other link is rejected.
/// </summary>
/// <param name="baseUrl">URL of the website being analysed.</param>
/// <param name="link">Raw href value; may be null or empty.</param>
/// <param name="target">Address to download, or null when the link is rejected.</param>
/// <param name="name">Output name for the page, or null when the link is rejected.</param>
/// <returns>True when the link belongs to one of the handled forms.</returns>
private static bool TryResolveLink(string baseUrl, string link, out string target, out string name)
{
    target = null;
    name = null;

    // An empty href would make the prefix checks below throw.
    if (string.IsNullOrEmpty(link))
    {
        return false;
    }

    if (link[0] == '/')
    {
        // Works whether or not the site URL ends with a slash.
        target = baseUrl.TrimEnd('/') + link;
        name = link.Substring(1);
        return true;
    }

    // Checked before the ".html" rule so an absolute link is never appended to the site URL.
    if (link.Contains(baseUrl))
    {
        target = link;
        string rest = link.StartsWith(baseUrl, StringComparison.Ordinal)
            ? link.Substring(baseUrl.Length).Trim('/')
            : "";
        // Only the site root keeps the name "Base"; other pages are named after their path
        // so that they do not all share a single output name.
        name = rest.Length == 0 ? "Base" : rest;
        return true;
    }

    if (link.Length > 5 && link.EndsWith(".html", StringComparison.Ordinal))
    {
        target = baseUrl + link;
        name = link;
        return true;
    }

    return false;
}