/// <summary>
/// Builds a corpus from the Paratext project stored in <paramref name="projectDir"/>.
/// </summary>
/// <remarks>
/// The project's <c>Settings.xml</c> is read for three optional elements:
/// <c>Encoding</c> (a code page number, 65001 when absent), <c>Versification</c>
/// (a numeric <see cref="ScrVersType"/>, English when absent) and <c>StyleSheet</c>
/// (a file name relative to the project directory, <c>usfm.sty</c> when absent).
/// One <see cref="UsfmFileText"/> is added for every file in the project directory
/// that matches <c>*.SFM</c>.
/// </remarks>
/// <param name="wordTokenizer">Tokenizer handed to every text that is added.</param>
/// <param name="projectDir">Directory that contains <c>Settings.xml</c> and the SFM files.</param>
/// <exception cref="InvalidOperationException">
/// The configured code page is not among those reported by <see cref="Encoding.GetEncodings"/>.
/// </exception>
public ParatextTextCorpus(ITokenizer<string, int> wordTokenizer, string projectDir)
{
    XDocument settings = XDocument.Load(Path.Combine(projectDir, "Settings.xml"));
    XElement root = settings.Root;

    // Resolve the text encoding from its code page number.
    int codePage = (int?)root.Element("Encoding") ?? 65001;
    EncodingInfo matchingInfo = Encoding.GetEncodings()
        .FirstOrDefault(info => info.CodePage == codePage);
    if (matchingInfo == null)
    {
        throw new InvalidOperationException("The Paratext project contains an unknown encoding.");
    }
    Encoding encoding = matchingInfo.GetEncoding();

    // The versification is stored as the numeric value of a ScrVersType member.
    int versificationCode = (int?)root.Element("Versification") ?? (int)ScrVersType.English;
    Versification = new ScrVers((ScrVersType)versificationCode);

    // The stylesheet name is resolved relative to the project directory.
    string stylesheetName = (string)root.Element("StyleSheet") ?? "usfm.sty";
    var stylesheet = new UsfmStylesheet(Path.Combine(projectDir, stylesheetName));

    foreach (string sfmPath in Directory.EnumerateFiles(projectDir, "*.SFM"))
    {
        AddText(new UsfmFileText(wordTokenizer, stylesheet, encoding, sfmPath, Versification));
    }
}
/// <summary>
/// Initializes the shared state of a USFM text: a parser created from
/// <paramref name="stylesheet"/>, the encoding, and the versification.
/// </summary>
/// <param name="wordTokenizer">Tokenizer forwarded to the base constructor.</param>
/// <param name="id">Text identifier forwarded to the base constructor.</param>
/// <param name="stylesheet">Stylesheet used to construct the <see cref="UsfmParser"/>.</param>
/// <param name="encoding">Encoding stored for later use by this text.</param>
/// <param name="versification">
/// Versification of the text; <see cref="ScrVers.English"/> is used when this is <c>null</c>.
/// </param>
protected UsfmTextBase(
    ITokenizer<string, int> wordTokenizer,
    string id,
    UsfmStylesheet stylesheet,
    Encoding encoding,
    ScrVers versification)
    : base(wordTokenizer, id)
{
    ScrVers effectiveVersification = versification ?? ScrVers.English;

    _parser = new UsfmParser(stylesheet);
    _encoding = encoding;
    Versification = effectiveVersification;
}
/// <summary>
/// Builds a corpus containing one <see cref="UsfmFileText"/> for every file in
/// <paramref name="projectPath"/> that matches <c>*.SFM</c>.
/// </summary>
/// <param name="wordTokenizer">Tokenizer handed to every text that is added.</param>
/// <param name="stylesheetFileName">Path of the stylesheet file used to create the <see cref="UsfmStylesheet"/>.</param>
/// <param name="encoding">Encoding passed to every text that is added.</param>
/// <param name="projectPath">Directory that is searched for SFM files.</param>
/// <param name="versification">
/// Versification of the corpus; <see cref="ScrVers.English"/> is used when this is <c>null</c>.
/// </param>
public UsfmFileTextCorpus(
    ITokenizer<string, int> wordTokenizer,
    string stylesheetFileName,
    Encoding encoding,
    string projectPath,
    ScrVers versification = null)
{
    Versification = versification ?? ScrVers.English;

    // A single stylesheet instance is shared by every text in the corpus.
    UsfmStylesheet sharedStylesheet = new UsfmStylesheet(stylesheetFileName);

    foreach (string sfmPath in Directory.EnumerateFiles(projectPath, "*.SFM"))
    {
        var text = new UsfmFileText(wordTokenizer, sharedStylesheet, encoding, sfmPath, Versification);
        AddText(text);
    }
}
/// <summary>
/// Creates a parser that keeps a reference to the given stylesheet.
/// </summary>
/// <param name="stylesheet">Stylesheet stored by this parser.</param>
public UsfmParser(UsfmStylesheet stylesheet)
{
    _stylesheet = stylesheet;
}
/// <summary>
/// Creates a USFM text backed by the file <paramref name="fileName"/>.
/// </summary>
/// <remarks>
/// The text identifier passed to the base constructor is obtained from
/// <c>GetId(fileName, encoding)</c>.
/// </remarks>
/// <param name="wordTokenizer">Tokenizer forwarded to the base constructor.</param>
/// <param name="stylesheet">Stylesheet forwarded to the base constructor.</param>
/// <param name="encoding">Encoding used to derive the id and forwarded to the base constructor.</param>
/// <param name="fileName">Path of the USFM file; stored by this instance.</param>
/// <param name="versification">Versification forwarded to the base constructor; may be <c>null</c>.</param>
public UsfmFileText(
    ITokenizer<string, int> wordTokenizer,
    UsfmStylesheet stylesheet,
    Encoding encoding,
    string fileName,
    ScrVers versification = null)
    : base(wordTokenizer, GetId(fileName, encoding), stylesheet, encoding, versification)
{
    _fileName = fileName;
}