Example #1
0
        /// <summary>
        /// Builds a corpus from the Paratext project located in <paramref name="projectDir"/>.
        /// </summary>
        /// <remarks>
        /// Loads <c>Settings.xml</c> from the project directory and reads its <c>Encoding</c>,
        /// <c>Versification</c> and <c>StyleSheet</c> elements, falling back to code page 65001,
        /// <c>ScrVersType.English</c> and <c>usfm.sty</c> respectively when an element is absent.
        /// One <c>UsfmFileText</c> is added for every file in the directory matching <c>*.SFM</c>.
        /// </remarks>
        /// <param name="wordTokenizer">Tokenizer passed to every <c>UsfmFileText</c> that is created.</param>
        /// <param name="projectDir">Directory holding <c>Settings.xml</c>, the stylesheet and the SFM files.</param>
        /// <exception cref="InvalidOperationException">
        /// None of the encodings returned by <c>Encoding.GetEncodings()</c> has the configured code page.
        /// </exception>
        public ParatextTextCorpus(ITokenizer<string, int> wordTokenizer, string projectDir)
        {
            var settings = XDocument.Load(Path.Combine(projectDir, "Settings.xml"));
            var root = settings.Root;

            // Resolve the project's text encoding from its code page.
            int codePage = (int?)root.Element("Encoding") ?? 65001;
            EncodingInfo match = null;
            foreach (EncodingInfo candidate in Encoding.GetEncodings())
            {
                if (candidate.CodePage == codePage)
                {
                    match = candidate;
                    break;
                }
            }

            if (match == null)
            {
                throw new InvalidOperationException("The Paratext project contains an unknown encoding.");
            }
            Encoding encoding = match.GetEncoding();

            // The versification is stored in the settings as the integer value of ScrVersType.
            int versType = (int?)root.Element("Versification") ?? (int)ScrVersType.English;
            Versification = new ScrVers((ScrVersType)versType);

            // The stylesheet file name is resolved relative to the project directory.
            string stylesheetName = (string)root.Element("StyleSheet") ?? "usfm.sty";
            var stylesheet = new UsfmStylesheet(Path.Combine(projectDir, stylesheetName));

            foreach (string path in Directory.EnumerateFiles(projectDir, "*.SFM"))
            {
                var text = new UsfmFileText(wordTokenizer, stylesheet, encoding, path, Versification);
                AddText(text);
            }
        }
Example #2
0
 /// <summary>
 /// Initializes the shared state of a USFM text: a <c>UsfmParser</c> built from
 /// <paramref name="stylesheet"/>, the text encoding, and the versification.
 /// </summary>
 /// <param name="wordTokenizer">Tokenizer forwarded to the base constructor.</param>
 /// <param name="id">Text identifier forwarded to the base constructor.</param>
 /// <param name="stylesheet">Stylesheet handed to the newly created <c>UsfmParser</c>.</param>
 /// <param name="encoding">Encoding stored in <c>_encoding</c>.</param>
 /// <param name="versification">
 /// Versification to expose; <c>ScrVers.English</c> is used when this is <c>null</c>.
 /// </param>
 protected UsfmTextBase(
     ITokenizer<string, int> wordTokenizer,
     string id,
     UsfmStylesheet stylesheet,
     Encoding encoding,
     ScrVers versification)
     : base(wordTokenizer, id)
 {
     this._encoding = encoding;
     this._parser = new UsfmParser(stylesheet);

     // Fall back to the English versification when the caller supplies none.
     this.Versification = versification ?? ScrVers.English;
 }
Example #3
0
        /// <summary>
        /// Builds a corpus containing one <c>UsfmFileText</c> for every file matching <c>*.SFM</c>
        /// in <paramref name="projectPath"/>.
        /// </summary>
        /// <param name="wordTokenizer">Tokenizer passed to every <c>UsfmFileText</c> that is created.</param>
        /// <param name="stylesheetFileName">Path used to construct the <c>UsfmStylesheet</c> shared by all texts.</param>
        /// <param name="encoding">Encoding passed to every <c>UsfmFileText</c>.</param>
        /// <param name="projectPath">Directory that is enumerated for <c>*.SFM</c> files.</param>
        /// <param name="versification">
        /// Versification for the corpus; <c>ScrVers.English</c> is used when this is <c>null</c>.
        /// </param>
        public UsfmFileTextCorpus(
            ITokenizer<string, int> wordTokenizer,
            string stylesheetFileName,
            Encoding encoding,
            string projectPath,
            ScrVers versification = null)
        {
            // Fall back to the English versification when the caller supplies none.
            Versification = versification ?? ScrVers.English;

            var sharedStylesheet = new UsfmStylesheet(stylesheetFileName);
            foreach (string path in Directory.EnumerateFiles(projectPath, "*.SFM"))
            {
                var text = new UsfmFileText(wordTokenizer, sharedStylesheet, encoding, path, Versification);
                AddText(text);
            }
        }
Example #4
0
 /// <summary>
 /// Initializes the parser with the stylesheet it keeps in <c>_stylesheet</c>.
 /// </summary>
 /// <param name="stylesheet">Stylesheet stored by this parser.</param>
 public UsfmParser(UsfmStylesheet stylesheet)
 {
     this._stylesheet = stylesheet;
 }
Example #5
0
 /// <summary>
 /// Creates a USFM text backed by the file <paramref name="fileName"/>.
 /// </summary>
 /// <remarks>
 /// The identifier passed to the base constructor is obtained from
 /// <c>GetId(fileName, encoding)</c>.
 /// </remarks>
 /// <param name="wordTokenizer">Tokenizer forwarded to the base constructor.</param>
 /// <param name="stylesheet">Stylesheet forwarded to the base constructor.</param>
 /// <param name="encoding">Encoding used by <c>GetId</c> and forwarded to the base constructor.</param>
 /// <param name="fileName">Path of the file; stored in <c>_fileName</c>.</param>
 /// <param name="versification">Versification forwarded to the base constructor; may be <c>null</c>.</param>
 public UsfmFileText(
     ITokenizer<string, int> wordTokenizer,
     UsfmStylesheet stylesheet,
     Encoding encoding,
     string fileName,
     ScrVers versification = null)
     : base(wordTokenizer, GetId(fileName, encoding), stylesheet, encoding, versification)
 {
     this._fileName = fileName;
 }