public void Create_EmptyOps_EmptySegments()
{
    var doc = new BsonDocument
    {
        { "_id", "abc123:MAT:1:target" },
        {
            "ops", new BsonArray
            {
                // Empty ops array
            }
        }
    };
    var numberOps = 0;
    var numberSegments = 0;
    var bookNumber = 40;
    var chapterNumber = 1;
    var projectId = "myProject";
    Assert.That(((BsonArray)doc["ops"]).Count, Is.EqualTo(numberOps), "Setup");
    var tokenizer = new LatinWordTokenizer();

    // SUT
    var text = new SFScriptureText(tokenizer, projectId, bookNumber, chapterNumber, doc);

    Assert.That(text.Id, Is.EqualTo($"{projectId}_{bookNumber}_{chapterNumber}"));
    Assert.That(text.Segments.Count(), Is.EqualTo(numberSegments));
}
public Task<ITextCorpus> CreateAsync(IEnumerable<string> projects, TextCorpusType type)
{
    var wordTokenizer = new LatinWordTokenizer();
    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        string dir = null;
        switch (type)
        {
            case TextCorpusType.Source:
                dir = "source";
                break;
            case TextCorpusType.Target:
                dir = "target";
                break;
        }

        foreach (string file in Directory.EnumerateFiles(Path.Combine(_textFileDir, projectId, dir), "*.txt"))
        {
            var text = new TextFileText(wordTokenizer,
                $"{projectId}_{Path.GetFileNameWithoutExtension(file)}", file);
            texts.Add(text);
        }
    }
    return Task.FromResult<ITextCorpus>(new DictionaryTextCorpus(texts));
}
public void Segments_EmptyDoc()
{
    var tokenizer = new LatinWordTokenizer();
    var doc = new XDocument(new XElement("TermRenderingsList"));
    var text = new SFBiblicalTermsText(tokenizer, "project01", doc);
    Assert.That(text.Segments, Is.Empty);
}
public void Texts()
{
    var tokenizer = new LatinWordTokenizer();
    var corpus = new UsfmFileTextCorpus(tokenizer, CorporaTestHelpers.UsfmStylesheetPath, Encoding.UTF8,
        CorporaTestHelpers.UsfmTestProjectPath);

    Assert.That(corpus.Texts.Select(t => t.Id), Is.EquivalentTo(new[] { "MAT", "MRK" }));
}
public void Create_HasDocOps_HasSegments()
{
    // Make a BsonDocument that looks like data from SF DB - xforge - texts.
    var doc = new BsonDocument
    {
        { "_id", "abc123:MAT:1:target" },
        {
            "ops", new BsonArray
            {
                new BsonDocument
                {
                    {
                        "insert", new BsonDocument
                        { { "chapter", new BsonDocument { { "number", "1" }, { "style", "c" } } } }
                    }
                },
                new BsonDocument
                {
                    {
                        "insert", new BsonDocument
                        { { "verse", new BsonDocument { { "number", "1" }, { "style", "v" } } } }
                    }
                },
                new BsonDocument
                {
                    { "insert", "First verse text here" },
                    { "attributes", new BsonDocument { { "segment", "verse_1_1" } } }
                }
            }
        }
    };
    var numberOps = 3;
    var numberSegments = 1;
    var bookNumber = 40;
    var chapterNumber = 1;
    var projectId = "myProject";
    Assert.That(((BsonArray)doc["ops"]).Count, Is.EqualTo(numberOps), "Setup");
    var tokenizer = new LatinWordTokenizer();

    // SUT
    var text = new SFScriptureText(tokenizer, projectId, bookNumber, chapterNumber, doc);

    Assert.That(text.Id, Is.EqualTo($"{projectId}_{bookNumber}_{chapterNumber}"));
    Assert.That(text.Segments.Count(), Is.EqualTo(numberSegments));
}
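// A minimal usage sketch of the SFScriptureText API exercised by the tests above and below.
// It is not taken from the source; "doc" stands for a chapter's ops BsonDocument like the one
// built in Create_HasDocOps_HasSegments, and the project id and book/chapter numbers are
// illustrative placeholders.
var scriptureText = new SFScriptureText(new LatinWordTokenizer(), "myProject", 40, 1, doc);
foreach (TextSegment segment in scriptureText.Segments)
{
    // SegmentRef identifies the verse segment; Segment holds the tokenized words.
    Console.WriteLine($"{segment.SegmentRef}: {string.Join(" ", segment.Segment)}");
}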
public void Segments_Guess()
{
    var tokenizer = new LatinWordTokenizer();
    var doc = new XDocument(new XElement("TermRenderingsList",
        TermRendering("term1", guess: true, "Term1"),
        TermRendering("term2", guess: true, "Term2")));
    var text = new SFBiblicalTermsText(tokenizer, "project01", doc);
    Assert.That(text.Segments, Is.Empty);
}
public TranslationEngine(string baseUrl, string projectId, IHttpClient httpClient = null)
{
    ProjectId = projectId;
    // The same Latin word tokenizer instance is used for both source and target segments.
    var wordTokenizer = new LatinWordTokenizer();
    SourceWordTokenizer = wordTokenizer;
    TargetWordTokenizer = wordTokenizer;
    RestClient = new TranslationRestClient(baseUrl, httpClient ?? new AjaxHttpClient());
    ErrorCorrectionModel = new ErrorCorrectionModel();
    _cts = new CancellationTokenSource();
}
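// A hypothetical usage sketch of this engine, mirroring the Approve_Success test further down
// (the base URL and project id are placeholders): translate a source segment interactively,
// correct the prefix, then approve it so the segment pair is submitted for training.
var engine = new TranslationEngine("http://localhost/", "project1");
engine.TranslateInteractively("En el principio la Palabra ya existía.", 0.2, session =>
{
    session.Initialize();
    session.UpdatePrefix("In the beginning the Word already existed.");
    session.Approve(success => { /* the approved segment pair was sent to trainSegment */ });
});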
public void Segments_EmptyText()
{
    var tokenizer = new LatinWordTokenizer();
    var corpus = new UsfmFileTextCorpus(tokenizer, CorporaTestHelpers.UsfmStylesheetPath, Encoding.UTF8,
        CorporaTestHelpers.UsfmTestProjectPath);

    IText text = corpus.GetText("MRK");
    TextSegment[] segments = text.Segments.ToArray();
    Assert.That(segments, Is.Empty);
}
public void Create_NullDoc_Crash()
{
    BsonDocument doc = null;
    var bookNumber = 40;
    var chapterNumber = 1;
    var projectId = "myProject";
    var tokenizer = new LatinWordTokenizer();

    // SUT
    Assert.Throws<ArgumentNullException>(
        () => new SFScriptureText(tokenizer, projectId, bookNumber, chapterNumber, doc));
}
public void TryGetText()
{
    var tokenizer = new LatinWordTokenizer();
    var corpus = new UsfmFileTextCorpus(tokenizer, CorporaTestHelpers.UsfmStylesheetPath, Encoding.UTF8,
        CorporaTestHelpers.UsfmTestProjectPath);

    IText text;
    Assert.That(corpus.TryGetText("MAT", out text), Is.True);
    Assert.That(text.Id, Is.EqualTo("MAT"));
    Assert.That(corpus.TryGetText("LUK", out _), Is.False);
}
public void Segments_Renderings()
{
    var tokenizer = new LatinWordTokenizer();
    var doc = new XDocument(new XElement("TermRenderingsList",
        TermRendering("term2", guess: false, "Term2"),
        TermRendering("term1", guess: false, "Term1")));
    var text = new SFBiblicalTermsText(tokenizer, "project01", doc);

    TextSegment[] segments = text.Segments.ToArray();
    Assert.That(segments.Length, Is.EqualTo(2));
    Assert.That(segments[0].SegmentRef.ToString(), Is.EqualTo("term1"));
    Assert.That(string.Join(" ", segments[0].Segment), Is.EqualTo("Term1"));
    Assert.That(string.Join(" ", segments[1].Segment), Is.EqualTo("Term2"));
}
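// The SFBiblicalTermsText tests above call a TermRendering helper that is not shown here.
// A minimal sketch of what such a helper could look like, assuming the Paratext
// TermRenderings.xml shape (Id and Guess attributes plus a Renderings child whose
// renderings are "||"-separated); the real helper may differ.
private static XElement TermRendering(string id, bool guess, params string[] renderings)
{
    return new XElement("TermRendering",
        new XAttribute("Id", id),
        new XAttribute("Guess", guess),
        new XElement("Renderings", string.Join("||", renderings)));
}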
public void Create_NoSegments_EmptySegments()
{
    var doc = new BsonDocument
    {
        { "_id", "abc123:MAT:1:target" },
        {
            "ops", new BsonArray
            {
                new BsonDocument
                {
                    {
                        "insert", new BsonDocument
                        { { "chapter", new BsonDocument { { "number", "1" }, { "style", "c" } } } }
                    }
                },
                new BsonDocument
                {
                    {
                        "insert", new BsonDocument
                        { { "verse", new BsonDocument { { "number", "1" }, { "style", "v" } } } }
                    }
                }
                // No verse text inserts with a segment reference.
            }
        }
    };
    var numberOps = 2;
    var numberSegments = 0;
    var bookNumber = 40;
    var chapterNumber = 1;
    var projectId = "myProject";
    Assert.That(((BsonArray)doc["ops"]).Count, Is.EqualTo(numberOps), "Setup");
    var tokenizer = new LatinWordTokenizer();

    // SUT
    var text = new SFScriptureText(tokenizer, projectId, bookNumber, chapterNumber, doc);

    Assert.That(text.Id, Is.EqualTo($"{projectId}_{bookNumber}_{chapterNumber}"));
    Assert.That(text.Segments.Count(), Is.EqualTo(numberSegments));
}
public void Create_MissingOps_Crash()
{
    var doc = new BsonDocument
    {
        { "_id", "abc123:MAT:1:target" }
        // Missing ops
    };
    var bookNumber = 40;
    var chapterNumber = 1;
    var projectId = "myProject";
    Assert.That(doc.Contains("ops"), Is.False, "Setup");
    var tokenizer = new LatinWordTokenizer();

    // SUT
    Assert.Throws<ArgumentException>(
        () => new SFScriptureText(tokenizer, projectId, bookNumber, chapterNumber, doc));
}
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer wordTokenizer = new LatinWordTokenizer();
    IMongoDatabase database = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
    IMongoCollection<BsonDocument> textDataColl = database.GetCollection<BsonDocument>(
        SFDataAccessConstants.TextDataCollectionName);
    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        TextType textType;
        switch (type)
        {
            case TextCorpusType.Source:
                textType = TextType.Source;
                break;
            case TextCorpusType.Target:
                textType = TextType.Target;
                break;
            default:
                throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
        }

        List<TextEntity> textList = await _texts.Query().Where(t => t.ProjectRef == projectId).ToListAsync();
        foreach (TextEntity text in textList)
        {
            foreach (Chapter chapter in text.Chapters)
            {
                FilterDefinition<BsonDocument> filter = Builders<BsonDocument>.Filter
                    .Eq("_id", TextEntity.GetTextDataId(text.Id, chapter.Number, textType));
                BsonDocument doc = await textDataColl.Find(filter).FirstAsync();
                texts.Add(new SFScriptureText(wordTokenizer, projectId, text.Id, chapter.Number, doc));
            }
        }
    }
    return texts;
}
public void Segments_NonEmptyText()
{
    var tokenizer = new LatinWordTokenizer();
    var corpus = new UsfmFileTextCorpus(tokenizer, CorporaTestHelpers.UsfmStylesheetPath, Encoding.UTF8,
        CorporaTestHelpers.UsfmTestProjectPath);

    IText text = corpus.GetText("MAT");
    TextSegment[] segments = text.Segments.ToArray();
    Assert.That(segments.Length, Is.EqualTo(10));

    Assert.That(segments[0].SegmentRef, Is.EqualTo(new VerseRef("MAT 1:1", corpus.Versification)));
    Assert.That(segments[0].Segment, Is.EqualTo("Chapter one , verse one .".Split()));

    Assert.That(segments[1].SegmentRef, Is.EqualTo(new VerseRef("MAT 1:2", corpus.Versification)));
    Assert.That(segments[1].Segment, Is.EqualTo("Chapter one , verse two .".Split()));

    Assert.That(segments[5].SegmentRef, Is.EqualTo(new VerseRef("MAT 2:1", corpus.Versification)));
    Assert.That(segments[5].Segment, Is.EqualTo("Chapter two , verse one .".Split()));

    Assert.That(segments[9].SegmentRef, Is.EqualTo(new VerseRef("MAT 2:5", corpus.Versification)));
    Assert.That(segments[9].Segment, Is.EqualTo("Chapter two , verse five .".Split()));
}
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer wordTokenizer = new LatinWordTokenizer();
    IMongoDatabase database = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
    IMongoCollection<BsonDocument> textDataColl = database.GetCollection<BsonDocument>(
        _realtimeService.GetCollectionName<TextData>());
    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        var project = await _realtimeService.GetSnapshotAsync<SFProject>(projectId);
        if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
        {
            throw new DataNotFoundException("The source project reference is missing");
        }

        string textCorpusProjectId = type switch
        {
            TextCorpusType.Source => project.TranslateConfig.Source.ProjectRef,
            TextCorpusType.Target => projectId,
            _ => throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType)),
        };

        foreach (TextInfo text in project.Texts.Where(t => t.HasSource))
        {
            foreach (Chapter chapter in text.Chapters)
            {
                string id = TextData.GetTextDocId(textCorpusProjectId, text.BookNum, chapter.Number);
                FilterDefinition<BsonDocument> filter = Builders<BsonDocument>.Filter.Eq("_id", id);
                BsonDocument doc = await textDataColl.Find(filter).FirstOrDefaultAsync();
                // Skip chapters whose text doc is missing or does not contain an ops array.
                if (doc != null && doc.TryGetValue("ops", out BsonValue ops) && ops as BsonArray != null)
                {
                    texts.Add(new SFScriptureText(wordTokenizer, projectId, text.BookNum, chapter.Number, doc));
                }
            }
        }
    }
    return texts;
}
private static void Approve_Success(Assert assert)
{
    string source = "En el principio la Palabra ya existía.";
    string prefix = "In the beginning the Word already existed.";
    MockHttpClient httpClient = CreateWebClient();
    httpClient.Requests.Add(new MockRequest
    {
        Method = HttpRequestMethod.Post,
        Url = "translation/engines/project:project1/actions/trainSegment",
        Action = (body, ct) =>
        {
            var segmentPair = JsonConvert.DeserializeObject<SegmentPairDto>(body, RestClientBase.SerializerSettings);
            var tokenizer = new LatinWordTokenizer();
            assert.DeepEqual(segmentPair.SourceSegment, tokenizer.TokenizeToStrings(source).ToArray());
            assert.DeepEqual(segmentPair.TargetSegment, tokenizer.TokenizeToStrings(prefix).ToArray());
            return Task.FromResult(true);
        },
        ResponseText = ""
    });
    var engine = new TranslationEngine("http://localhost/", "project1", httpClient);

    Action done = assert.Async();
    engine.TranslateInteractively(source, 0.2, session =>
    {
        assert.NotEqual(session, null);
        session.Initialize();
        session.UpdatePrefix(prefix);
        session.Approve(success =>
        {
            assert.Ok(success);
            done();
        });
    });
}
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer wordTokenizer = new LatinWordTokenizer();
    IMongoDatabase database = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
    IMongoCollection<BsonDocument> textDataColl = database.GetCollection<BsonDocument>(
        _realtimeService.GetCollectionName<TextData>());
    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        var project = await _realtimeService.GetSnapshotAsync<SFProject>(projectId);
        if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
        {
            throw new DataNotFoundException("The source project reference is missing");
        }

        string textCorpusProjectId;
        string paratextId;
        switch (type)
        {
            case TextCorpusType.Source:
                textCorpusProjectId = project.TranslateConfig.Source.ProjectRef;
                paratextId = project.TranslateConfig.Source.ParatextId;
                break;
            case TextCorpusType.Target:
                textCorpusProjectId = projectId;
                paratextId = project.ParatextId;
                break;
            default:
                throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
        }

        foreach (TextInfo text in project.Texts.Where(t => t.HasSource))
        {
            foreach (Chapter chapter in text.Chapters)
            {
                string id = TextData.GetTextDocId(textCorpusProjectId, text.BookNum, chapter.Number);
                FilterDefinition<BsonDocument> filter = Builders<BsonDocument>.Filter.Eq("_id", id);
                BsonDocument doc = await textDataColl.Find(filter).FirstOrDefaultAsync();
                // Skip chapters whose text doc is missing or does not contain an ops array.
                if (doc != null && doc.TryGetValue("ops", out BsonValue ops) && ops as BsonArray != null)
                {
                    texts.Add(new SFScriptureText(wordTokenizer, projectId, text.BookNum, chapter.Number, doc));
                }
            }
        }

        // Also include biblical term renderings if the project has a TermRenderings.xml file.
        string termRenderingsFileName = Path.Combine(_siteOptions.Value.SiteDir, "sync", paratextId, "target",
            "TermRenderings.xml");
        if (_fileSystemService.FileExists(termRenderingsFileName))
        {
            using var stream = _fileSystemService.OpenFile(termRenderingsFileName, FileMode.Open);
            XDocument termRenderingsDoc = await XDocument.LoadAsync(stream, LoadOptions.None,
                CancellationToken.None);
            texts.Add(new SFBiblicalTermsText(wordTokenizer, projectId, termRenderingsDoc));
        }
    }
    return texts;
}
private async Task<IReadOnlyList<IText>> CreateTextsAsync(IEnumerable<string> projects, TextCorpusType type)
{
    StringTokenizer wordTokenizer = new LatinWordTokenizer();
    IMongoDatabase sfDatabase = _mongoClient.GetDatabase("scriptureforge");
    IMongoDatabase realtimeDatabase = _mongoClient.GetDatabase("realtime");
    IMongoCollection<BsonDocument> projectsColl = sfDatabase.GetCollection<BsonDocument>("projects");
    var texts = new List<IText>();
    foreach (string projectId in projects)
    {
        Project project = await _projectRepo.GetAsync(projectId);
        if (project == null)
        {
            continue;
        }

        string segmentType = null;
        string suffix = null;
        switch (type)
        {
            case TextCorpusType.Source:
                suffix = "source";
                segmentType = project.SourceSegmentType;
                break;
            case TextCorpusType.Target:
                suffix = "target";
                segmentType = project.TargetSegmentType;
                break;
        }

        StringTokenizer segmentTokenizer = null;
        if (segmentType != null)
        {
            segmentTokenizer = WebApiUtils.CreateSegmentTokenizer(segmentType);
        }

        FilterDefinition<BsonDocument> filter = Builders<BsonDocument>.Filter.Eq("_id", ObjectId.Parse(projectId));
        BsonDocument projectDoc = await projectsColl.Find(filter).FirstOrDefaultAsync();
        if (projectDoc == null)
        {
            continue;
        }

        var code = "sf_" + (string)projectDoc["projectCode"];
        var isScripture = (bool)projectDoc["config"]["isTranslationDataScripture"];
        IMongoCollection<BsonDocument> projectColl = realtimeDatabase.GetCollection<BsonDocument>(code);
        IMongoDatabase projectDatabase = _mongoClient.GetDatabase(code);
        IMongoCollection<BsonDocument> translateColl = projectDatabase.GetCollection<BsonDocument>("translate");
        filter = Builders<BsonDocument>.Filter.Eq("isDeleted", false);
        using (IAsyncCursor<BsonDocument> cursor = await translateColl.Find(filter).ToCursorAsync())
        {
            while (await cursor.MoveNextAsync())
            {
                foreach (BsonDocument docInfo in cursor.Current)
                {
                    var id = (ObjectId)docInfo["_id"];
                    filter = Builders<BsonDocument>.Filter.Eq("_id", $"{id}:{suffix}");
                    BsonDocument doc = await projectColl.Find(filter).FirstAsync();
                    if (isScripture)
                    {
                        texts.Add(new XForgeScriptureText(wordTokenizer, project.Id, doc));
                    }
                    else
                    {
                        texts.Add(new XForgeRichText(segmentTokenizer, wordTokenizer, project.Id, doc));
                    }
                }
            }
        }
    }
    return texts;
}