/// <summary>
/// Reads a USFX file node by node and converts it for SILE typesetting.
/// Start tags are dispatched on the element name: a book element resets the per-book state and
/// opens an output file through OpenSileFile(); footnotes, cross references and all
/// unrecognised elements are written to sileFile; metadata elements are skipped or captured
/// into fields (running header, chapter and verse numbers).
/// NOTE(review): end tags and text nodes are still routed through the "mosis" helpers
/// (WriteMosisEndElement, mosis.WriteString) and not through sileFile. This looks like
/// unfinished porting from an OSIS converter; confirm before relying on the output.
/// </summary>
/// <param name="usfxFileName">Path of the USFX XML file to read.</param>
/// <param name="sileDirectory">Directory stored in the sileDir field (presumably where OpenSileFile creates its output; confirm).</param>
/// <returns>true when the whole input was read; false when a book element with a missing or unknown ID stopped the conversion.</returns>
public bool ConvertUsfxToSile(string usfxFileName, string sileDirectory)
{
    bool result = false;
    sileDir = sileDirectory;
    usfx = new XmlTextReader(usfxFileName);
    try
    {
        usfx.WhitespaceHandling = WhitespaceHandling.All;
        while (usfx.Read())
        {
            Logit.ShowStatus("converting to SILE " + cv);
            if (usfx.NodeType == XmlNodeType.Element)
            {
                // GetAttribute returns null for a missing attribute, so each of these may be null.
                level = usfx.GetAttribute("level");
                style = usfx.GetAttribute("style");
                sfm = usfx.GetAttribute("sfm");
                caller = usfx.GetAttribute("caller");
                id = usfx.GetAttribute("id");
                switch (usfx.Name)
                {
                    case "languageCode":
                        SkipElement();
                        break;
                    case "book":
                        // Reset all per-book state.
                        currentBookHeader = currentBookTitle = String.Empty;
                        toc1 = toc2 = toc3 = String.Empty;
                        inToc1 = inToc2 = inToc3 = false;
                        currentChapter = currentChapterPublished = currentChapterAlternate = String.Empty;
                        currentVerse = currentVersePublished = currentVerseAlternate = String.Empty;
                        titleWritten = false;
                        chapterWritten = false;
                        // A missing id attribute is handled like a too-short one instead of throwing.
                        bool hasBookId = (id != null) && (id.Length > 2);
                        if (hasBookId)
                        {
                            currentBookAbbrev = id;
                            bookRecord = (BibleBookRecord)bookInfo.books[currentBookAbbrev];
                        }
                        if ((bookRecord == null) || !hasBookId)
                        {
                            // Report the offending ID, not the abbreviation left over from the previous book.
                            Logit.WriteError("Cannot process unknown book: " + id);
                            return(false);
                        }
                        if ((bookRecord.testament == "a") && !globe.projectOptions.includeApocrypha)
                        {
                            SkipElement();
                        }
                        else if (!globe.projectOptions.allowedBookList.Contains(bookRecord.tla)) // Check for presence of book in bookorder.txt
                        {
                            SkipElement();
                        }
                        else
                        {
                            // We have a book we want to write out.
                            OpenSileFile();
                        }
                        break;
                    case "fe":  // End note. Rarely used, fortunately, but in the standards. Treat as regular footnote.
                    case "f":   // footnote
                        if (!usfx.IsEmptyElement)
                        {
                            // "-" suppresses the caller; "+" or no caller requests the next automatic marker.
                            if (caller == "-")
                            {
                                caller = String.Empty;
                            }
                            else if ((caller == "+") || (String.IsNullOrEmpty(caller)))
                            {
                                caller = footnoteMark.Marker();
                            }
                            sileFile.WriteStartElement("f");
                            sileFile.WriteAttributeString("caller", caller);
                        }
                        break;
                    case "x":   // Cross references
                        if (!usfx.IsEmptyElement)
                        {
                            if (caller == "-")
                            {
                                caller = String.Empty;
                            }
                            else if ((caller == "+") || (String.IsNullOrEmpty(caller)))
                            {
                                caller = xrefMark.Marker();
                            }
                            sileFile.WriteStartElement("x");
                            sileFile.WriteAttributeString("caller", caller);
                        }
                        break;
                    case "ide":
                    case "fm":  // Should not actually be in any field texts. Safe to skip.
                    case "idx": // Peripherals - Back Matter Index
                        SkipElement();
                        break;
                    case "ie":  // Introduction end
                        SkipElement();
                        break;
                    case "id":
                        if (id != currentBookAbbrev)
                        {
                            Logit.WriteError("Book ID in <id> and <book> do not match: " + currentBookAbbrev + " is not " + id);
                        }
                        SkipElement();  // Strip out comment portion.
                        break;
                    case "toc": // Table of Contents entries
                        if (String.IsNullOrEmpty(level) || (level == "1"))
                        {
                            inToc1 = true;
                        }
                        else if (level == "2")
                        {
                            inToc2 = true;
                        }
                        else if (level == "3")
                        {
                            // NOTE(review): inToc3 is only cleared by the next book element; the
                            // end-tag and text handling below test inToc1/inToc2 only.
                            inToc3 = true;
                        }
                        else
                        {
                            SkipElement();
                        }
                        break;
                    case "rem": // Comment; not part of the actual text
                        SkipElement();
                        break;
                    case "h":
                        currentBookHeader = ReadElementText().Trim();
                        break;
                    case "c":
                        currentChapter = id;
                        currentChapterPublished = fileHelper.LocalizeDigits(currentChapter);
                        currentChapterAlternate = String.Empty;
                        currentVerse = currentVersePublished = currentVerseAlternate = String.Empty;
                        // The published chapter number is taken from the element text and overrides
                        // the value derived from the id attribute above.
                        currentChapterPublished = chapterLabel + fileHelper.LocalizeDigits(ReadElementText().Trim());
                        chapterWritten = false;
                        break;
                    case "cl":
                        // Before the first chapter this is the book-wide chapter label;
                        // afterwards it replaces the published label of the current chapter.
                        if (currentChapter == String.Empty)
                        {
                            chapterLabel = ReadElementText().Trim() + " ";
                        }
                        else
                        {
                            currentChapterPublished = ReadElementText().Trim();
                        }
                        break;
                    case "cp":
                        if (!usfx.IsEmptyElement)
                        {
                            currentChapterPublished = ReadElementText().Trim();
                        }
                        break;
                    case "v":
                        PrintChapter();
                        // A missing id attribute is treated as an empty verse number instead of throwing.
                        string verseId = id ?? String.Empty;
                        currentVersePublished = fileHelper.LocalizeDigits(verseId);
                        currentVerse = verseId.Replace("\u200F", "");   // Strip out RTL character
                        currentVerseAlternate = "";
                        if (!usfx.IsEmptyElement)
                        {
                            // Element text, when present, is the published verse number.
                            usfx.Read();
                            if (usfx.NodeType == XmlNodeType.Text)
                            {
                                currentVersePublished = fileHelper.LocalizeDigits(usfx.Value.Trim());
                            }
                            if (usfx.NodeType != XmlNodeType.EndElement)
                            {
                                usfx.Read();
                            }
                        }
                        break;
                    case "va":  // Not supported by The Sword Project
                        SkipElement();
                        break;
                    case "vp":
                        // This feature is not supported by The Sword Project.
                        SkipElement();
                        break;
                    case "periph":
                        SkipElement();
                        break;
                    case "cs":  // Rare or new character style: don't know what it should be, so throw away tag & keep text.
                        break;
                    case "gw":  // Do nothing. Not sure what to do with glossary words, yet.
                    case "xt":  // Do nothing.
                    case "ft":  // Ignore. It does nothing useful, but is an artifact of USFM exclusive character styles.
                        break;
                    case "usfx":    // Nothing to do, here.
                        break;
                    case "dc":
                    case "xdc":
                    case "fdc":
                        if (!globe.projectOptions.includeApocrypha)
                        {
                            SkipElement();
                        }
                        break;
                    default:
                        // Pass any other element through with whichever attributes were present.
                        sileFile.WriteStartElement(usfx.Name);
                        if (id != null)
                        {
                            sileFile.WriteAttributeString("id", id);
                        }
                        if (caller != null)
                        {
                            sileFile.WriteAttributeString("caller", caller);
                        }
                        if (level != null)
                        {
                            sileFile.WriteAttributeString("level", level);
                        }
                        if (sfm != null)
                        {
                            sileFile.WriteAttributeString("sfm", sfm);
                        }
                        if (style != null)
                        {
                            sileFile.WriteAttributeString("style", style);
                        }
                        if (usfx.IsEmptyElement)
                        {
                            sileFile.WriteEndElement();
                        }
                        break;
                }
            }
            else if (usfx.NodeType == XmlNodeType.EndElement)
            {
                if (inToc1 || inToc2)
                {
                    if (usfx.Name == "toc")
                    {
                        inToc2 = inToc1 = false;
                    }
                    else if (inToc1 && usfx.Name == "it")
                    {
                        toc1 += "</hi></seg>";
                    }
                    else
                    {
                        Logit.WriteLine("Warning: " + usfx.Name + " end markup in title at " + currentBookAbbrev + " not written to OSIS file");
                    }
                }
                else
                {
                    // The "toc" end tag never reaches this switch: it is handled by the branch above.
                    switch (usfx.Name)
                    {
                        case "w":
                        case "zw":
                            if (inStrongs)
                            {
                                WriteMosisEndElement();
                                inStrongs = false;
                            }
                            break;
                        case "wj":
                            WriteMosisEndElement(); // q
                            break;
                        case "book":
                            EndLineGroup();
                            EndCurrentVerse();
                            EndCurrentChapter();
                            EndIntroduction();
                            EndMajorSection();
                            WriteMosisEndElement(); // div type="book"
                            CheckElementLevel(3, "closed book");
                            break;
                        case "bdit":
                            WriteMosisEndElement(); // hi italic
                            WriteMosisEndElement(); // hi bold
                            break;
                        case "p":
                            if (itemLevel > 0)
                            {
                                itemLevel--;
                            }
                            CheckMinimumLevel(5, "Ending " + usfx.Name + " " + osisVerseId);
                            inNote = false;
                            if (eatPoetryLineEnd)
                            {
                                eatPoetryLineEnd = false;
                            }
                            else
                            {
                                WriteMosisEndElement();
                            }
                            break;
                        case "q":
                            if (eatPoetryLineEnd)
                            {
                                eatPoetryLineEnd = false;
                            }
                            else
                            {
                                WriteMosisEndElement();
                            }
                            break;
                        case "ref":
                            if (inReference)
                            {
                                WriteMosisEndElement(); // reference
                                inReference = false;
                            }
                            break;
                        case "fe":
                        case "f":
                        case "x":
                            if (inNote)
                            {
                                inNote = false;
                                WriteMosisEndElement(); // End of note
                            }
                            break;
                        case "add":
                            if (!inNote)
                            {
                                WriteMosisEndElement();
                            }
                            break;
                        case "qs":
                            if (inLineGroup)
                            {
                                WriteMosisEndElement();
                                inPoetryLine = false;
                            }
                            break;
                        case "bd":
                        case "bk":
                        case "cl":
                        case "d":
                        case "dc":
                        case "em":
                        case "fk":
                        case "fp":
                        case "fq":
                        case "fqa":
                        case "fr":
                        case "fv":
                        case "k":
                        case "no":
                        case "pn":
                        case "qac":
                        case "qt":
                        case "r":
                        case "rq":
                        case "s":
                        case "sc":
                        case "sig":
                        case "sls":
                        case "table":
                        case "tc":
                        case "tcr":
                        case "th":
                        case "thr":
                        case "tl":
                        case "tr":
                        case "xo":
                        case "ord":
                            // case "xq": Not useful for Sword modules.
                            WriteMosisEndElement(); // note, hi, reference, title, l, transChange, etc.
                            break;
                        case "it":
                            if (!inStrongs)
                            {
                                WriteMosisEndElement();
                            }
                            break;
                        case "nd":
                            WriteMosisEndElement(); // divineName
                            WriteMosisEndElement(); // seg
                            break;
                        case "xk":
                        case "fl":
                        case "zcr":
                        case "zcb":
                        case "zcg":
                        case "zcy":
                            // not supported.
                            break;
                    }
                }
            }
            else if (((usfx.NodeType == XmlNodeType.Text) ||
                      (usfx.NodeType == XmlNodeType.SignificantWhitespace) ||
                      (usfx.NodeType == XmlNodeType.Whitespace)) && !ignore)
            {
                // Text inside a level 1 or 2 toc element is accumulated; everything else is written out.
                if (inToc1)
                {
                    toc1 = toc1 + usfx.Value;
                }
                else if (inToc2)
                {
                    toc2 = toc2 + usfx.Value;
                }
                else
                {
                    mosis.WriteString(usfx.Value);
                }
            }
        }
        // The whole document was read without hitting the unknown-book exit.
        result = true;
    }
    finally
    {
        // Release the input file on every exit path, including the early return and exceptions.
        usfx.Close();
    }
    return(result);
}
/// <summary>
/// Create an index of Strong's numbers (corresponding to the lemma or root word lexicon entry number).
/// NOTE: Call MakeJsonIndex immediately before calling MakeLemmaIndex: this method relies on the
/// includedVerses set, the open sqlConcordance writer and concTableName that MakeJsonIndex sets up,
/// and it closes sqlConcordance when it is done (also on cancellation or error).
/// </summary>
/// <param name="lemmaTextFile">Name of the XML file to read. Each "v" element carries b, c and v
/// attributes (book, chapter, verse) and its text is a whitespace-separated list of lemma numbers.</param>
/// <param name="lemmaDir">Directory to write the lemma index .json files into; created if necessary.</param>
public void MakeLemmaIndex(string lemmaTextFile, string lemmaDir)
{
    string oneWord;
    string bookCode;
    searchTextXml = new XmlTextReader(lemmaTextFile);
    wordCollection = new Hashtable(19999);
    StreamWriter[] lemmaFiles = null;
    int i, j, lineLength;
    char ch;
    char defaultSourceLanguage = 'H';
    Utils.EnsureDirectory(lemmaDir);
    BibleBookRecord br;
    try
    {
        // Read all references to Strong's numbers into wordCollection hash table.
        while (searchTextXml.Read())
        {
            if ((searchTextXml.NodeType == XmlNodeType.Element) && (searchTextXml.Name == "v"))
            {
                bookCode = fileHelper.GetNamedAttribute(searchTextXml, "b");
                currentBook = bookInfo.getShortCode(bookCode);
                br = (BibleBookRecord)bookInfo.books[bookCode];
                if (br == null)
                {
                    // Report the bad code and skip this verse instead of failing on a null record.
                    Logit.WriteError("Unknown book code in lemma file: " + bookCode);
                    continue;
                }
                // Bare numbers are prefixed with H in books whose testament is "o", G otherwise.
                if (br.testament == "o")
                {
                    defaultSourceLanguage = 'H';
                }
                else
                {
                    defaultSourceLanguage = 'G';
                }
                currentChapter = fileHelper.GetNamedAttribute(searchTextXml, "c");
                startVerse = currentVerse = fileHelper.GetNamedAttribute(searchTextXml, "v");
                // Verse numbers might be verse bridges, like "20-22" or simple numbers, like "20".
                i = currentVerse.IndexOf('-');
                if (i > 0)
                {
                    startVerse = startVerse.Substring(0, i);
                }
                verseID = currentBook + currentChapter + "_" + startVerse;
                if (!Logit.ShowStatus("Creating lemma index " + verseID))
                {
                    return; // Cancelled; the finally block releases the reader and writers.
                }
                searchTextXml.Read();
                if (includedVerses.Contains(verseID) && (searchTextXml.NodeType == XmlNodeType.Text))
                {
                    // Split the verse text on white space; every completed word goes to AddWordToLemma().
                    string s = searchTextXml.Value;
                    for (i = 0; i < s.Length; i++)
                    {
                        if (!Char.IsWhiteSpace(s[i]))
                        {
                            if (word.Length == 0)
                            {
                                if (Char.IsDigit(s[i]))
                                {
                                    word.Append(defaultSourceLanguage);
                                }
                            }
                            word.Append(s[i]);
                        }
                        else
                        {
                            AddWordToLemma();
                        }
                    }
                    AddWordToLemma();
                }
            }
        }
        searchTextXml.Close();

        // Write search index with fewer files.
        bool[] commaNeeded = new bool[LEMMASIZE];   // Boolean variables are created with value "false"
        lemmaFiles = new StreamWriter[LEMMASIZE];
        char srcLang = 'G';
        // The first 6 files are named _G0000.json, _G1000.json, ...; numbering restarts with _H0000.json.
        for (i = 0, j = 0; i < LEMMASIZE; i++, j++)
        {
            if (i == 6)
            {
                srcLang = 'H';
                j = 0;
            }
            lemmaFiles[i] = new StreamWriter(Path.Combine(lemmaDir, "_" + srcLang + j.ToString() + "000.json"), false, Encoding.UTF8);
            lemmaFiles[i].Write("{\n");
        }

        foreach (DictionaryEntry de in wordCollection)
        {
            oneWord = (string)de.Key;
            int hash = HashLemma(oneWord);
            string longString = (string)de.Value;
            // NOTE(review): values are spliced into the SQL and JSON text without escaping; this is
            // only safe while keys and verse lists contain no quote or backslash characters.
            sqlConcordance.WriteLine("INSERT INTO {0} VALUES (\"{1}\",\"{2}\");", concTableName, oneWord, longString);
            // Turn the comma-separated list into quoted JSON array items, starting a new line
            // once the running line length passes 100 characters.
            StringBuilder sb = new StringBuilder();
            lineLength = 26 + oneWord.Length;
            for (i = 0; i < longString.Length; i++)
            {
                ch = longString[i];
                if (ch == ',')
                {
                    sb.Append("\",");
                    lineLength += 2;
                    if (lineLength > 100)
                    {
                        sb.Append("\n");
                        lineLength = 0;
                    }
                    sb.Append("\"");
                    lineLength++;
                }
                else
                {
                    sb.Append(ch);
                    lineLength++;
                }
            }
            if (commaNeeded[hash])
            {
                lemmaFiles[hash].Write(",\n");
            }
            lemmaFiles[hash].Write("\"{0}\":[\"{1}\"]", oneWord, sb.ToString());
            commaNeeded[hash] = true;
            if (!Logit.ShowStatus("Writing lemma index " + oneWord))
            {
                return; // Cancelled; the finally block releases the writers.
            }
        }

        for (i = 0; i < LEMMASIZE; i++)
        {
            lemmaFiles[i].Write("}\n");
            lemmaFiles[i].Close();
        }
        sqlConcordance.WriteLine("UNLOCK TABLES;");
        sqlConcordance.Close();
    }
    catch (Exception ex)
    {
        Logit.WriteError(ex.Message);
    }
    finally
    {
        // Release whatever is still open after a cancellation or an error. Closing a reader or
        // writer that was already closed on the normal path does nothing.
        try
        {
            searchTextXml.Close();
            if (lemmaFiles != null)
            {
                for (i = 0; i < lemmaFiles.Length; i++)
                {
                    if (lemmaFiles[i] != null)
                    {
                        lemmaFiles[i].Close();
                    }
                }
            }
            if (sqlConcordance != null)
            {
                sqlConcordance.Close();
            }
        }
        catch (Exception ex)
        {
            // Keep the "log, never throw" behaviour of this method.
            Logit.WriteError(ex.Message);
        }
    }
}
/// <summary>
/// Create an index file to speed searches in Browser Bible.
/// Also starts the SQL concordance file and fills includedVerses with the IDs of all verses
/// that have non-blank text. The SQL writer (sqlConcordance) is deliberately left open on return:
/// MakeLemmaIndex appends to it and closes it.
/// </summary>
/// <param name="verseTextFile">Name of XML file with normalized search text by verse.</param>
/// <param name="searchDir">Name of directory to write search files into.</param>
/// <param name="sqlFile">Name of the SQL file to create.</param>
public void MakeJsonIndex(string verseTextFile, string searchDir, string sqlFile)
{
    string oneWord;
    searchTextXml = new XmlTextReader(verseTextFile);
    wordCollection = new Hashtable(400009);
    StreamWriter[] wordFiles = null;
    int i, lineLength;
    char ch;
    try
    {
        Utils.EnsureDirectory(searchDir);
        includedVerses = new HashSet<string>();
        sqlConcordance = new StreamWriter(sqlFile, false, Encoding.UTF8);
        concTableName = Path.GetFileNameWithoutExtension(sqlFile);
        // Write SQL file preamble
        sqlConcordance.WriteLine(@"USE sofia; DROP TABLE IF EXISTS sofia.{0}; CREATE TABLE {0} ( keyWord VARCHAR(128) COLLATE UTF8_GENERAL_CI NOT NULL, verseList TEXT NOT NULL) ENGINE=MyISAM; LOCK TABLES {0} WRITE;", concTableName);

        // Read the verse list
        while (searchTextXml.Read())
        {
            if ((searchTextXml.NodeType == XmlNodeType.Element) && (searchTextXml.Name == "v"))
            {
                currentBook = bookInfo.getShortCode(fileHelper.GetNamedAttribute(searchTextXml, "b"));
                currentChapter = fileHelper.GetNamedAttribute(searchTextXml, "c");
                startVerse = currentVerse = fileHelper.GetNamedAttribute(searchTextXml, "v");
                // Verse numbers might be verse bridges, like "20-22" or simple numbers, like "20".
                i = currentVerse.IndexOf('-');
                if (i > 0)
                {
                    startVerse = startVerse.Substring(0, i);
                }
                verseID = currentBook + currentChapter + "_" + startVerse;
                if (!Logit.ShowStatus("Creating word index " + verseID))
                {
                    return; // Cancelled; the finally block closes the reader.
                }
                searchTextXml.Read();
                if (searchTextXml.NodeType == XmlNodeType.Text)
                {
                    if (searchTextXml.Value.Trim().Length > 0)
                    {
                        includedVerses.Add(verseID);
                    }
                    IndexWords(searchTextXml.Value);
                }
            }
        }
        searchTextXml.Close();

        // Write search index with fewer files.
        bool[] commaNeeded = new bool[HASHSIZE];
        wordFiles = new StreamWriter[HASHSIZE];
        for (i = 0; i < HASHSIZE; i++)
        {
            wordFiles[i] = new StreamWriter(Path.Combine(searchDir, "_" + i.ToString() + ".json"));
            wordFiles[i].Write("{\n");
        }

        foreach (DictionaryEntry de in wordCollection)
        {
            oneWord = (string)de.Key;
            if (oneWord.Length > 0)
            {
                int hash = HashWord(oneWord);
                string longString = (string)de.Value;
                // NOTE(review): values are spliced into the SQL and JSON text without escaping; this
                // is only safe while IndexWords never stores quote or backslash characters.
                sqlConcordance.WriteLine("INSERT INTO {0} VALUES (\"{1}\",\"{2}\");", concTableName, oneWord, longString);
                // Every word goes into the SQL file, but only words starting with a letter go
                // into the JSON files, so the JSON list is only built for those.
                if (Char.IsLetter(oneWord[0]))
                {
                    // Turn the comma-separated list into quoted JSON array items, starting a new
                    // line once the running line length passes 100 characters.
                    StringBuilder sb = new StringBuilder();
                    lineLength = 26 + oneWord.Length;
                    for (i = 0; i < longString.Length; i++)
                    {
                        ch = longString[i];
                        if (ch == ',')
                        {
                            sb.Append("\",");
                            lineLength += 2;
                            if (lineLength > 100)
                            {
                                sb.Append("\n");
                                lineLength = 0;
                            }
                            sb.Append("\"");
                            lineLength++;
                        }
                        else
                        {
                            sb.Append(ch);
                            lineLength++;
                        }
                    }
                    if (commaNeeded[hash])
                    {
                        wordFiles[hash].Write(",\n");
                    }
                    wordFiles[hash].Write("\"{0}\":[\"{1}\"]", oneWord, sb.ToString());
                    commaNeeded[hash] = true;
                }
                if (!Logit.ShowStatus("Writing word index " + oneWord))
                {
                    return; // Cancelled; the finally block closes the word files.
                }
            }
        }

        for (i = 0; i < HASHSIZE; i++)
        {
            wordFiles[i].Write("}\n");
            wordFiles[i].Close();
        }
    }
    finally
    {
        // Release the reader and any word file still open after a cancellation or an exception.
        // Closing an already closed reader or writer does nothing. sqlConcordance is not touched
        // here because MakeLemmaIndex continues writing to it.
        searchTextXml.Close();
        if (wordFiles != null)
        {
            for (i = 0; i < wordFiles.Length; i++)
            {
                if (wordFiles[i] != null)
                {
                    wordFiles[i].Close();
                }
            }
        }
    }
}
/// <summary>
/// Reads a USFX file and prepares it for full text search (or concordance generation)
/// by extracting only the canonical text within verses (and the canonical Psalm titles,
/// which are prepended to verse 1 text), stripping out all formatting, footnotes, etc.,
/// and normalizing all white space to single spaces. These verse text strings are put
/// into an XML file with one "v" element per verse, with book, chapter, and verse given
/// in attributes b, c, and v, respectively.
/// Two companion files are created next to the verse file, with the extensions .vpltxt and
/// .lemma; the .lemma file is an XML document with the root element "lemmaFile".
/// </summary>
/// <param name="usfxFileName">Name of the USFX file to extract canonical text from</param>
/// <param name="verseFileName">Name of XML unformatted verse text only file</param>
/// <returns>true if the whole file was processed; false if Logit.ShowStatus requested a stop
/// or an exception occurred (its message is logged through Logit.WriteError).</returns>
public bool Filter(string usfxFileName, string verseFileName)
{
    string level = String.Empty;
    string sfm = String.Empty;
    string id = String.Empty;
    string strongs = String.Empty;
    verseText = new StringBuilder();
    lemmaText = new StringBuilder();
    bool result = false;
    try
    {
        utf8encoding = new UTF8Encoding(false);
        vplFile = new StreamWriter(Path.ChangeExtension(verseFileName, ".vpltxt"), false, utf8encoding);
        lemmaFile = new XmlTextWriter(Path.ChangeExtension(verseFileName, ".lemma"), utf8encoding);
        lemmaFile.Formatting = Formatting.Indented;
        lemmaFile.WriteStartDocument();
        lemmaFile.WriteStartElement("lemmaFile");
        usfx = new XmlTextReader(usfxFileName);
        usfx.WhitespaceHandling = WhitespaceHandling.All;
        verseFile = new XmlTextWriter(verseFileName, utf8encoding);
        verseFile.Formatting = Formatting.Indented;
        verseFile.WriteStartDocument();
        verseFile.WriteStartElement("verseFile");
        while (usfx.Read())
        {
            if (!Logit.ShowStatus("extracting search text " + currentPlace))
            {
                return(false);  // Stop requested; the finally block closes all files.
            }
            if (usfx.NodeType == XmlNodeType.Element)
            {
                level = fileHelper.GetNamedAttribute(usfx, "level");
                sfm = fileHelper.GetNamedAttribute(usfx, "sfm");
                id = fileHelper.GetNamedAttribute(usfx, "id");
                switch (usfx.Name)
                {
                    case "book":
                        currentChapter = String.Empty;
                        currentVerse = String.Empty;
                        bool knownBook = false;
                        if (id.Length == 3)
                        {
                            currentBook = id;
                            bookRecord = (BibleBookRecord)bookInfo.books[currentBook];
                            if (bookRecord != null)
                            {
                                osisBook = bookRecord.osisName;
                                BibleWorksBook = bookRecord.bibleworksCode;
                                knownBook = true;
                            }
                        }
                        // Exactly one of the branches below may skip the book; skipping twice
                        // would swallow whatever follows it.
                        if (!knownBook)
                        {
                            Logit.WriteError("Cannot process unknown book: " + id);
                            SkipElement();
                        }
                        else if (bookRecord.testament == "x")
                        {
                            // Skip peripherals.
                            SkipElement();
                        }
                        currentPlace = currentBook;
                        break;
                    case "id":
                        if (id != currentBook)
                        {
                            Logit.WriteError("Book ID in <id> and <book> do not match; " + currentBook + " is not " + id);
                        }
                        SkipElement();  // Strip out comment portion.
                        break;
                    case "h":
                        // The header text becomes the vernacular short name of the book.
                        usfx.Read();
                        if (usfx.NodeType == XmlNodeType.Text)
                        {
                            bookRecord.vernacularShortName = usfx.Value.Trim();
                        }
                        break;
                    case "toc":
                        usfx.Read();
                        if (usfx.NodeType == XmlNodeType.Text)
                        {
                            if (level == "1")
                            {
                                bookRecord.vernacularLongName = usfx.Value.Trim();
                            }
                            else if (level == "2")
                            {
                                // Prefer the toc2 text when no usable short name exists yet or
                                // when it is shorter than the current one.
                                string sn = usfx.Value.Trim();
                                if ((bookRecord.vernacularShortName.Length < 2) || (sn.Length < bookRecord.vernacularShortName.Length))
                                {
                                    bookRecord.vernacularShortName = sn;
                                }
                            }
                        }
                        break;
                    case "c":
                        EndVerse(); // In case file lacks <ve /> elements.
                        currentChapter = id;
                        currentVerse = String.Empty;
                        currentPlace = currentBook + "_" + currentChapter;
                        SkipElement();  // Doesn't skip chapter, just the published chapter number, if present.
                        break;
                    case "v":
                        EndVerse(); // In case file lacks <ve /> elements.
                        inVerse = true;
                        currentVerse = id;
                        currentPlace = currentBook + "_" + currentChapter + "_" + currentVerse;
                        SkipElement();  // Just in case there is a published verse number present.
                        break;
                    case "ve":
                        EndVerse();
                        break;
                    case "b":   // blank line
                    case "optionalLineBreak":
                    case "qs":
                    case "th":
                    case "thr":
                    case "tc":
                    case "tcr":
                        // These elements separate words, so make sure a space ends up between them.
                        if (inVerse)
                        {
                            verseText.Append(' ');
                        }
                        break;
                    case "d":   // Make canonical psalm titles searchable
                        inPsalmTitle = true;
                        break;
                    case "add":
                        verseText.Append("[");
                        break;
                    case "nd":
                        // No marker is written for divine names.
                        break;
                    case "languageCode":
                    case "f":   // footnote
                    case "fe":  // End note. Rarely used, fortunately, but in the standards.
                    case "x":   // Cross references
                    case "glo":
                    case "ide":
                    case "fig": // figure
                    case "fdc":
                    case "fm":  // Should not actually be in any field texts. Safe to skip.
                    case "idx": // Peripherals - Back Matter Index
                    case "ie":  // Introduction end
                    case "iex": // Introduction explanatory or bridge text
                    case "fp":
                    case "rem": // Comment; not part of the actual text
                    case "cl":
                    case "ca":
                    case "vp":
                    case "periph":
                    case "milestone":
                    case "rq":
                    case "s":
                        SkipElement();
                        break;
                    case "w":
                        // Collect the value of the "s" attribute of tagged words.
                        strongs = fileHelper.GetNamedAttribute(usfx, "s");
                        if (!String.IsNullOrEmpty(strongs))
                        {
                            lemmaText.Append(strongs + " ");
                        }
                        break;
                    case "p":
                        // Every paragraph style starting with "i" is skipped by the first branch
                        // (this covers intro, ip, im, iq, is, io, ili and the other i-styles),
                        // so only the remaining non-canonical styles are listed explicitly.
                        if (sfm.StartsWith("i", StringComparison.Ordinal))
                        {
                            SkipElement();
                        }
                        else
                        {
                            switch (sfm)
                            {
                                case "cd":
                                case "hr":  // Horizontal rule not supported. Try a line break.
                                case "mt":
                                case "keyword":
                                case "r":
                                    SkipElement();
                                    break;
                            }
                        }
                        break;
                }
            }
            else if (usfx.NodeType == XmlNodeType.EndElement)
            {
                switch (usfx.Name)
                {
                    case "book":
                        EndVerse(); // In case file lacks <ve /> elements.
                        currentBook = currentChapter = currentVerse = String.Empty;
                        break;
                    case "d":
                        inPsalmTitle = false;
                        break;
                    case "add":
                        verseText.Append("]");
                        break;
                    case "nd":
                        // No marker is written for divine names.
                        break;
                }
            }
            else if (usfx.NodeType == XmlNodeType.Text)
            {
                if (inVerse || inPsalmTitle)
                {
                    verseText.Append(usfx.Value);
                }
            }
            else if ((usfx.NodeType == XmlNodeType.SignificantWhitespace) || (usfx.NodeType == XmlNodeType.Whitespace))
            {
                // Any run of white space between elements counts as a single space.
                if (inVerse || inPsalmTitle)
                {
                    verseText.Append(" ");
                }
            }
        }
        Logit.ShowStatus("writing " + verseFileName);
        verseFile.WriteEndElement();    // verseFile
        lemmaFile.WriteEndElement();    // lemmaFile
        verseFile.Close();
        lemmaFile.Close();
        vplFile.Close();
        usfx.Close();
        result = true;
    }
    catch (Exception ex)
    {
        Logit.WriteError(ex.Message);
    }
    finally
    {
        // Release whatever is still open after a stop request or an error. On the normal path
        // everything has been closed already and closing again does nothing.
        try
        {
            if (verseFile != null)
            {
                verseFile.Close();
            }
            if (lemmaFile != null)
            {
                lemmaFile.Close();
            }
            if (vplFile != null)
            {
                vplFile.Close();
            }
            if (usfx != null)
            {
                usfx.Close();
            }
        }
        catch (Exception ex)
        {
            // Keep the "log, never throw" behaviour of this method.
            Logit.WriteError(ex.Message);
        }
    }
    return(result);
}