/// <summary>
/// Builds the HTML table of contents markup that is inserted into the website output
/// </summary>
/// <returns>The HTML to insert for the table of contents</returns>
/// <remarks>The entries are read from the <c>WebTOC.xml</c> file in the working folder.  Every child
/// element of its <c>HelpTOC</c> node is handed to <c>AppendTocEntry</c> which renders it into the
/// string builder.</remarks>
private string GenerateHtmlToc()
{
    // Start with the default encoding.  ReadWithEncoding receives it by reference so that it can
    // report a different one if byte order marks are present.
    Encoding detectedEncoding = Encoding.Default;
    string tocXml = BuildProcess.ReadWithEncoding(workingFolder + "WebTOC.xml", ref detectedEncoding);

    XPathDocument tocDocument;

    using(StringReader xmlSource = new StringReader(tocXml))
    {
        tocDocument = new XPathDocument(xmlSource);
    }

    // Only the elements directly below the HelpTOC node are selected here
    XPathNodeIterator topLevelEntries = tocDocument.CreateNavigator().Select("HelpTOC/*");
    StringBuilder html = new StringBuilder(2048);

    this.AppendTocEntry(topLevelEntries, html);

    return html.ToString();
}
//=====================================================================

/// <summary>
/// Constructor
/// </summary>
/// <param name="exclusions">The file containing common word exclusions.  The file should contain one
/// word per line in lowercase.  These words will not appear in the index.</param>
/// <param name="language">The culture information</param>
/// <exception cref="ArgumentException">This is thrown if the exclusion filename is null, empty, or
/// refers to a file that does not exist</exception>
public FullTextIndex(string exclusions, CultureInfo language)
{
    bool exclusionFileUsable = !String.IsNullOrEmpty(exclusions) && File.Exists(exclusions);

    if(!exclusionFileUsable)
    {
        throw new ArgumentException("Exclusion file cannot be null or an empty string and must exist");
    }

    Encoding detectedEncoding = Encoding.Default;

    // Load the exclusion list and collapse whitespace runs to single spaces before splitting it
    string exclusionText = reCondenseWS.Replace(
        BuildProcess.ReadWithEncoding(exclusions, ref detectedEncoding), " ");

    lang = language;
    exclusionWords = new HashSet<string>();

    string[] candidates = reSplitWords.Split(exclusionText);

    for(int idx = 0; idx < candidates.Length; idx++)
    {
        // Entries of one or two characters are not stored
        if(candidates[idx].Length <= 2)
        {
            continue;
        }

        exclusionWords.Add(candidates[idx]);
    }

    fileList = new List<string>();
    wordDictionary = new Dictionary<string, List<long>>();
}
/// <summary>
/// Load the specified collection file into an <see cref="XmlDocument"/> so that it can be edited
/// </summary>
/// <param name="file">The file to open</param>
/// <returns>The XML document created from the file content</returns>
/// <remarks>The DTD declaration is stripped from the text before it is parsed so the returned
/// document does not contain it.</remarks>
private static XmlDocument OpenCollectionFile(string file)
{
    Encoding detectedEncoding = Encoding.Default;

    // Read the file and drop the DTD declaration in one step.  Only the first and third groups
    // captured by the expression are kept.
    string xml = reRemoveDTD.Replace(BuildProcess.ReadWithEncoding(file, ref detectedEncoding), "$1$3");

    XmlDocument collection = new XmlDocument();
    collection.LoadXml(xml);

    return collection;
}
/// <summary>
/// Scan a file and collect the information needed to represent it in the help file's table of
/// contents
/// </summary>
/// <param name="filename">The file from which to extract the information</param>
/// <returns>The table of contents entry describing the file</returns>
/// <remarks>Besides the table of contents settings, the entry also records whether the content
/// contains links to resolve, code blocks, blocks to colorize, or project tags/shared content items
/// so that later copy steps know which processing to apply.</remarks>
internal static TocEntry GetTocInfo(string filename)
{
    Encoding detectedEncoding = Encoding.Default;
    string text = BuildProcess.ReadWithEncoding(filename, ref detectedEncoding);

    TocEntry info = new TocEntry(null);

    info.IncludePage = !reTocExclude.IsMatch(text);
    info.IsDefaultTopic = reIsDefaultTopic.IsMatch(text);

    if(reSplitToc.IsMatch(text))
    {
        info.ApiParentMode = ApiParentMode.InsertAfter;
    }

    Match sortOrderMatch = reSortOrder.Match(text);

    if(sortOrderMatch.Success)
    {
        info.SortOrder = Convert.ToInt32(sortOrderMatch.Groups["SortOrder"].Value,
            CultureInfo.InvariantCulture);
    }

    // Use the page title when one is found.  Line breaks are removed from it after decoding.  When
    // there is no title, fall back to the filename without its path or extension.
    Match titleMatch = rePageTitle.Match(text);

    info.Title = titleMatch.Success ?
        HttpUtility.HtmlDecode(titleMatch.Groups["Title"].Value).Replace(
            "\r", String.Empty).Replace("\n", String.Empty) :
        Path.GetFileNameWithoutExtension(filename);

    // The content is already in memory so note now which kinds of processing it will need when
    // the file is copied.
    info.HasLinks = reResolveLinks.IsMatch(text);
    info.HasCodeBlocks = reCodeBlock.IsMatch(text);
    info.NeedsColorizing = reColorizeCheck.IsMatch(text);
    info.HasProjectTags = (reProjectTags.IsMatch(text) || reSharedContent.IsMatch(text));

    return info;
}
/// <summary>
/// This is used to transform a *.topic file into a *.html file using an XSLT transformation based on the
/// presentation style.
/// </summary>
/// <param name="sourceFile">The source topic filename.  It is deleted once the transformed file has
/// been written.</param>
/// <remarks>The compiled transformation is created on first use and kept for subsequent calls.  After
/// the transformation, the resulting HTML file is post-processed: self-closing span and script tags
/// are expanded, code blocks are expanded and colorized, links are resolved, and project tags and
/// shared content items are replaced.</remarks>
/// <exception cref="BuilderException">This is thrown with code BE0017 if loading the style sheet or
/// applying the transformation fails</exception>
private void XslTransform(string sourceFile)
{
    TocEntry tocInfo;
    XmlReader reader = null;
    XmlWriter writer = null;
    XsltSettings settings;
    XmlReaderSettings readerSettings;
    XmlWriterSettings writerSettings;
    Encoding enc = Encoding.Default;
    FileItemCollection transforms;
    string content;
    string sourceStylesheet, destFile = Path.ChangeExtension(sourceFile, ".html");

    try
    {
        // CloseInput ensures that closing a reader also closes the file stream it was created on
        readerSettings = new XmlReaderSettings();
        readerSettings.CloseInput = true;
        readerSettings.DtdProcessing = DtdProcessing.Parse;

        // Create the transform on first use
        if(xslTransform == null)
        {
            transforms = new FileItemCollection(project, BuildAction.TopicTransform);

            if(transforms.Count != 0)
            {
                if(transforms.Count > 1)
                {
                    this.ReportWarning("BE0011", "Multiple topic transformations found. Using '{0}'",
                        transforms[0].FullPath);
                }

                sourceStylesheet = transforms[0].FullPath;
            }
            else
            {
                sourceStylesheet = templateFolder + project.PresentationStyle + ".xsl";
            }

            xslStylesheet = workingFolder + Path.GetFileName(sourceStylesheet);
            tocInfo = BuildProcess.GetTocInfo(sourceStylesheet);

            // The style sheet may contain shared content items so we must resolve it this way rather
            // than using TransformTemplate.
            this.ResolveLinksAndCopy(sourceStylesheet, xslStylesheet, tocInfo);

            XslCompiledTransform compiledTransform = new XslCompiledTransform();
            settings = new XsltSettings(true, true);

            // Dispose of the style sheet reader once the transform is compiled so that the file
            // handle is released right away rather than whenever the reader gets finalized.
            using(XmlReader stylesheetReader = XmlReader.Create(xslStylesheet, readerSettings))
            {
                compiledTransform.Load(stylesheetReader, settings, new XmlUrlResolver());
            }

            // Only store the transform once it has loaded successfully.  If it were stored before
            // loading and the load failed, a later call would skip this block and try to use a
            // transform that has no style sheet.
            xslArguments = new XsltArgumentList();
            xslTransform = compiledTransform;
        }

        this.ReportProgress("Applying XSL transformation '{0}' to '{1}'.", xslStylesheet, sourceFile);

        reader = XmlReader.Create(sourceFile, readerSettings);
        writerSettings = xslTransform.OutputSettings.Clone();
        writerSettings.CloseOutput = true;
        writerSettings.Indent = false;

        writer = XmlWriter.Create(destFile, writerSettings);

        xslArguments.Clear();
        xslArguments.AddParam("pathToRoot", String.Empty, pathToRoot);
        xslTransform.Transform(reader, xslArguments, writer);
    }
    catch(Exception ex)
    {
        throw new BuilderException("BE0017", String.Format(CultureInfo.CurrentCulture,
            "Unexpected error using '{0}' to transform additional content file '{1}' to '{2}'. The " +
            "error is: {3}\r\n{4}", xslStylesheet, sourceFile, destFile, ex.Message,
            (ex.InnerException == null) ? String.Empty : ex.InnerException.Message));
    }
    finally
    {
        if(reader != null)
        {
            reader.Close();
        }

        if(writer != null)
        {
            writer.Flush();
            writer.Close();
        }
    }

    // The source topic file is deleted as the transformed file takes its place
    File.Delete(sourceFile);

    // <span> and <script> tags cannot be self-closing if empty.  The template may contain them correctly
    // but when written out as XML, they get converted to self-closing tags which breaks them.  To fix
    // them, convert them to full start and close tags.
    content = BuildProcess.ReadWithEncoding(destFile, ref enc);
    content = reSpanScript.Replace(content, "<$1$2></$1>");

    // An XSL transform might have added tags and include items that need replacing so run it through
    // those options if needed.
    tocInfo = BuildProcess.GetTocInfo(destFile);

    // Expand <code> tags if necessary
    if(tocInfo.HasCodeBlocks)
    {
        content = reCodeBlock.Replace(content, codeBlockMatchEval);
    }

    // Colorize <pre> tags if necessary
    if(tocInfo.NeedsColorizing || tocInfo.HasCodeBlocks)
    {
        // Initialize code colorizer on first use
        if(codeColorizer == null)
        {
            codeColorizer = new CodeColorizer(ComponentUtilities.ToolsFolder +
                @"PresentationStyles\Colorizer\highlight.xml", ComponentUtilities.ToolsFolder +
                @"PresentationStyles\Colorizer\highlight.xsl");
        }

        // Set the path to the "Copy" image
        codeColorizer.CopyImageUrl = pathToRoot + "icons/CopyCode.gif";

        // Colorize it and replace the "Copy" literal text with the shared content include item so that
        // it gets localized.
        content = codeColorizer.ProcessAndHighlightText(content);
        content = content.Replace(codeColorizer.CopyText + "</span", "<include item=\"copyCode\"/></span");

        // The include item inserted above must be resolved by the shared content step below
        tocInfo.HasProjectTags = true;
    }

    // Use a regular expression to find and replace all tags with cref attributes with a link to the help
    // file content.  This needs to happen after the code block processing as they may contain <see> tags
    // that need to be resolved.
    if(tocInfo.HasLinks || tocInfo.HasCodeBlocks)
    {
        content = reResolveLinks.Replace(content, linkMatchEval);
    }

    // Replace project option tags with project option values
    if(tocInfo.HasProjectTags)
    {
        // Project tags can be nested
        while(reProjectTags.IsMatch(content))
        {
            content = reProjectTags.Replace(content, fieldMatchEval);
        }

        // Shared content items can be nested
        while(reSharedContent.IsMatch(content))
        {
            content = reSharedContent.Replace(content, contentMatchEval);
        }
    }

    // Write the file back out with the appropriate encoding
    using(StreamWriter sw = new StreamWriter(destFile, false, enc))
    {
        sw.Write(content);
    }
}
/// <summary>
/// Load an additional content file, apply code block, link, and tag processing to it as indicated by
/// its table of contents entry, and write the result to the output location.
/// </summary>
/// <param name="sourceFile">The source filename to copy</param>
/// <param name="destFile">The destination filename</param>
/// <param name="entry">The entry being resolved.  Its <c>HasProjectTags</c> property is set to true if
/// the content gets colorized.</param>
/// <remarks>Files with a ".topic" extension are written with a ".topic" extension and are then passed
/// to <c>XslTransform</c> to produce the final HTML file.</remarks>
internal void ResolveLinksAndCopy(string sourceFile, string destFile, TocEntry entry)
{
    Encoding fileEncoding = Encoding.Default;
    bool isTopic = sourceFile.EndsWith(".topic", StringComparison.OrdinalIgnoreCase);

    // The TOC refers to topics by the ".html" name that they end up with after transformation so
    // switch the destination back to ".topic" for now.
    if(isTopic)
    {
        destFile = Path.ChangeExtension(destFile, ".topic");
    }

    this.ReportProgress("{0} -> {1}", sourceFile, destFile);

    // The default encoding is used unless byte order marks indicate a different one
    string text = BuildProcess.ReadWithEncoding(sourceFile, ref fileEncoding);

    // Expand <code> tags if necessary
    if(entry.HasCodeBlocks)
    {
        text = reCodeBlock.Replace(text, codeBlockMatchEval);
    }

    // Colorize <pre> tags if necessary
    if(entry.NeedsColorizing || entry.HasCodeBlocks)
    {
        // The colorizer is created the first time it is needed
        if(codeColorizer == null)
        {
            codeColorizer = new CodeColorizer(
                ComponentUtilities.ToolsFolder + @"PresentationStyles\Colorizer\highlight.xml",
                ComponentUtilities.ToolsFolder + @"PresentationStyles\Colorizer\highlight.xsl");
        }

        codeColorizer.CopyImageUrl = pathToRoot + "icons/CopyCode.gif";

        // After colorizing, swap the literal "Copy" text for the shared content include item so that
        // it gets localized.
        text = codeColorizer.ProcessAndHighlightText(text);
        text = text.Replace(codeColorizer.CopyText + "</span", "<include item=\"copyCode\"/></span");
        entry.HasProjectTags = true;

        // Topics and style sheets get transformed, in which case the colorizer links should be in the
        // XSL style sheet.  All other files have the style sheet and script links inserted here.
        if(!isTopic && !sourceFile.EndsWith(".xsl", StringComparison.OrdinalIgnoreCase))
        {
            string headLinks = String.Format(CultureInfo.InvariantCulture,
                "<link type='text/css' rel='stylesheet' href='{0}styles/highlight.css' />" +
                "<script type='text/javascript' src='{0}scripts/highlight_ac.js'></script>", pathToRoot);

            int insertAt = text.IndexOf("</head>", StringComparison.Ordinal);

            if(insertAt < 0)
            {
                // There is no <head> section so create one.  It goes right after the <html> tag if
                // there is one or at the very start of the content if not.
                headLinks = "<head>" + headLinks + "</head>";

                int htmlTag = text.IndexOf("<html>", StringComparison.Ordinal);

                insertAt = (htmlTag == -1) ? 0 : htmlTag + 6;
            }

            text = text.Insert(insertAt, headLinks);
        }

        // Copy the colorizer files if not already there
        this.EnsureOutputFoldersExist("icons");
        this.EnsureOutputFoldersExist("styles");
        this.EnsureOutputFoldersExist("scripts");

        foreach(string outputFolder in this.HelpFormatOutputFolders)
        {
            // The presence of the style sheet is used to tell whether this folder was handled already
            if(File.Exists(outputFolder + @"styles\highlight.css"))
            {
                continue;
            }

            string target = outputFolder + @"styles\highlight.css";

            File.Copy(ComponentUtilities.ToolsFolder + @"PresentationStyles\Colorizer\highlight.css",
                target);
            File.SetAttributes(target, FileAttributes.Normal);

            target = outputFolder + @"scripts\highlight_ac.js";

            File.Copy(ComponentUtilities.ToolsFolder + @"PresentationStyles\Colorizer\highlight_ac.js",
                target);
            File.SetAttributes(target, FileAttributes.Normal);

            // An existing image is deleted before copying as the filename casing may be different
            target = outputFolder + @"icons\CopyCode.gif";

            if(File.Exists(target))
            {
                File.SetAttributes(target, FileAttributes.Normal);
                File.Delete(target);
            }

            File.Copy(ComponentUtilities.ToolsFolder + @"PresentationStyles\Colorizer\CopyCode.gif",
                target);
            File.SetAttributes(target, FileAttributes.Normal);

            target = outputFolder + @"icons\CopyCode_h.gif";

            if(File.Exists(target))
            {
                File.SetAttributes(target, FileAttributes.Normal);
                File.Delete(target);
            }

            File.Copy(ComponentUtilities.ToolsFolder + @"PresentationStyles\Colorizer\CopyCode_h.gif",
                target);
            File.SetAttributes(target, FileAttributes.Normal);
        }
    }

    // Tags with cref attributes are replaced with links to the help file content.  This is done
    // after the code block processing as the code blocks may contain <see> tags to resolve.
    if(entry.HasLinks || entry.HasCodeBlocks)
    {
        text = reResolveLinks.Replace(text, linkMatchEval);
    }

    // Replace project option tags with project option values
    if(entry.HasProjectTags)
    {
        // Both kinds of item can be nested so keep replacing until none are left
        while(reProjectTags.IsMatch(text))
        {
            text = reProjectTags.Replace(text, fieldMatchEval);
        }

        while(reSharedContent.IsMatch(text))
        {
            text = reSharedContent.Replace(text, contentMatchEval);
        }
    }

    // Write the result out using the encoding with which the source was read
    using(StreamWriter output = new StreamWriter(destFile, false, fileEncoding))
    {
        output.Write(text);
    }

    // Transform .topic files into .html files
    if(isTopic)
    {
        this.XslTransform(destFile);
    }
}
//=====================================================================

/// <summary>
/// Create a full-text index from web pages found in the specified file path
/// </summary>
/// <param name="filePath">The path containing the files to index.  All files matching "*.htm?" in it
/// and its subfolders are indexed.</param>
/// <remarks>Words in the exclusion list, those that are less than three characters long, and anything
/// starting with a digit will not appear in the index.</remarks>
/// <exception cref="ArgumentException">This is thrown if the file path is null or an empty
/// string</exception>
public void CreateFullTextIndex(string filePath)
{
    // Fail with a meaningful error rather than a null reference or index out of range exception
    // from the trailing separator check below.
    if(String.IsNullOrEmpty(filePath))
    {
        throw new ArgumentException("The file path cannot be null or an empty string", "filePath");
    }

    Dictionary<string, int> wordCounts = new Dictionary<string, int>();
    Encoding enc = Encoding.Default;
    Match m;
    string content, fileInfo, title;
    string[] words;
    int rootPathLength;

    // Work out how much to strip from each enumerated filename to make it relative to the root
    // folder.  A forward slash is a valid terminator too.  Treating such a path as unterminated would
    // cut the first character off of every relative filename.
    char lastChar = filePath[filePath.Length - 1];

    if(lastChar == '\\' || lastChar == '/')
    {
        rootPathLength = filePath.Length;
    }
    else
    {
        rootPathLength = filePath.Length + 1;
    }

    foreach(string name in Directory.EnumerateFiles(filePath, "*.htm?", SearchOption.AllDirectories))
    {
        content = BuildProcess.ReadWithEncoding(name, ref enc);

        // Extract the page title
        m = rePageTitle.Match(content);

        if(!m.Success)
        {
            title = Path.GetFileNameWithoutExtension(name);
        }
        else
        {
            title = m.Groups["Title"].Value.Trim();
        }

        // Put some space between tags
        content = content.Replace("><", "> <");

        // Remove script, style sheet, and head blocks as they won't contain any usable keywords.  Pre
        // tags contain code which may or may not be useful but we'll leave them alone for now.
        content = reStripScriptStyleHead.Replace(content, " ");

        // Remove all HTML tags
        content = reStripTags.Replace(content, " ");

        // Decode the text
        content = HttpUtility.HtmlDecode(content);

        // Strip apostrophe suffixes
        content = reStripApos.Replace(content, String.Empty);

        // Condense all runs of whitespace to a single space
        content = reCondenseWS.Replace(content, " ");

        // Convert to lowercase and split text on non-word boundaries
        words = reSplitWords.Split(content.ToLower(lang));

        // We're going to use simple types for the index structure so that we don't have to deploy an
        // assembly to deserialize it.  As such, concatenate the title, filename, and its word count
        // into a string separated by nulls.  Note that file paths are assumed to be relative to the
        // root folder.
        fileInfo = String.Join("\x0", new string[] { title,
            name.Substring(rootPathLength).Replace('\\', '/'),
            words.Length.ToString(CultureInfo.InvariantCulture) });

        wordCounts.Clear();

        // Get a list of all unique words and the number of time that they appear in this file.
        // Exclude words that are less than three characters in length, start with a digit, or
        // are in the common words exclusion list.
        foreach(string word in words)
        {
            if(word.Length < 3 || Char.IsDigit(word[0]) || exclusionWords.Contains(word))
            {
                continue;
            }

            // The number of times it occurs helps determine the ranking of the search results.  A
            // word that hasn't been seen yet leaves the count at zero.
            int occurrences;

            wordCounts.TryGetValue(word, out occurrences);
            wordCounts[word] = occurrences + 1;
        }

        // Shouldn't happen but just in case, ignore files with no usable words
        if(wordCounts.Count != 0)
        {
            fileList.Add(fileInfo);

            // Store the file index in the upper part of a 64-bit integer and the word count
            // in the lower 16-bits.  More room is given to the file count as some builds
            // contain a large number of topics.
            long fileIndex = (long)(fileList.Count - 1) << 16;

            // Add the index information to the word dictionary
            foreach(KeyValuePair<string, int> wordCount in wordCounts)
            {
                // For each unique word, we'll track the files in which it occurs and the number
                // of times it occurs in each file.
                List<long> fileEntries;

                if(!wordDictionary.TryGetValue(wordCount.Key, out fileEntries))
                {
                    fileEntries = new List<long>();
                    wordDictionary.Add(wordCount.Key, fileEntries);
                }

                // Cap the count at what fits in 16 bits.  Masking it would make very large counts
                // wrap around (65536 would be stored as zero).
                fileEntries.Add(fileIndex + Math.Min(wordCount.Value, 0xFFFF));
            }
        }
    }
}