//add <t> and <u> tags for title and url text public void AddTitleUrlTags() { string path = Helper.DOCS_PATH; string[] files = Directory.GetFiles(path); d.Doc p; int pageId; FileInfo fi; StreamWriter sw; for (int i = 0; i < files.Length; i++) { fi = new FileInfo(files[i]); pageId = Convert.ToInt32(fi.Name.Substring(0, fi.Name.IndexOf("."))); p = new d.Doc(pageId); sw = new StreamWriter(files[i], true); sw.WriteLine("<u>" + p.Url + "</u>"); sw.WriteLine("<t>" + p.Title + "</t>"); sw.Close(); Console.WriteLine(i); } }
public void Run() { UrlHelper urlHelper = new UrlHelper(); Regex regex = makeRegex(); MatchCollection mc; Regex rSpace = new Regex(@"\s"); d.DocData pd = new d.DocData(); d.LinkData ld = new d.LinkData(); Uri baseUri; Uri childUri; FileInfo fi; d.Doc p; int pageId; StreamReader sr; string html; string linkToProcess; int linkId; string linkText; string path = Helper.DOCS_PATH; string[] files = Directory.GetFiles(path); for (int i = 545; i < files.Length; i++) //545 already done { Console.WriteLine("processing file #" + i); fi = new FileInfo(files[i]); pageId = Convert.ToInt32(fi.Name.Substring(0, fi.Name.IndexOf("."))); p = new d.Doc(pageId); baseUri = new Uri(p.Url); sr = new StreamReader(fi.OpenRead()); html = sr.ReadToEnd(); mc = regex.Matches(html); Console.WriteLine("found " + mc.Count + " links"); foreach (Match m in mc) { try { childUri = new Uri(baseUri, urlHelper.MakeLink(m.Groups[3].ToString())); linkToProcess = urlHelper.NormalizeUrl(childUri.AbsoluteUri); linkText = m.Groups[4].ToString(); linkId = pd.GetIdByUrl(linkToProcess); if (linkId > 0 && linkText != "") //found page! { linkText = rSpace.Replace(linkText, " "); linkText = linkText.Trim(); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); linkText = linkText.Replace(" ", " "); ld.UpdateText(pageId, linkId, linkText); } } catch (Exception) {} } //if (i % 100 == 0) // Console.WriteLine("processing file #" + i); } }
private void BindTitle() { d.Doc doc = new d.Doc(getId()); lblUrl.Text = "<a href='" + doc.Url + "' target='_blank'>" + doc.Url + "</a> (" + doc.Title + ")"; }