public void Create_EmptyOps_EmptySegments()
        {
            // Arrange: a text data document whose "ops" array holds no operations.
            int    expectedOps      = 0;
            int    expectedSegments = 0;
            int    book             = 40;
            int    chapter          = 1;
            string project          = "myProject";
            string expectedId       = $"{project}_{book}_{chapter}";

            var emptyOps = new BsonArray();
            var doc = new BsonDocument
            {
                { "_id", "abc123:MAT:1:target" },
                { "ops", emptyOps }
            };

            // Sanity-check the fixture before exercising the class under test.
            var opsInDoc = (BsonArray)doc["ops"];
            Assert.That(opsInDoc.Count, Is.EqualTo(expectedOps), "Setup");

            // SUT
            var text = new SFScriptureText(new LatinWordTokenizer(), project, book, chapter, doc);

            // Assert: the id is "<project>_<book>_<chapter>" and no segments were produced.
            Assert.That(text.Id, Is.EqualTo(expectedId));
            Assert.That(text.Segments.Count(), Is.EqualTo(expectedSegments));
        }
Exemple #2
0
        /// <summary>
        /// Builds a text corpus from the "*.txt" files found in the "source" or "target" sub-directory of each
        /// project's directory under the configured text file directory.
        /// </summary>
        /// <param name="projects">Project ids; each id is used as a directory name and as the text id prefix.</param>
        /// <param name="type">Selects the "source" or the "target" sub-directory of every project.</param>
        /// <returns>An already completed task holding a <see cref="DictionaryTextCorpus"/> of all texts found.</returns>
        /// <exception cref="System.ComponentModel.InvalidEnumArgumentException">
        /// <paramref name="type"/> is neither <see cref="TextCorpusType.Source"/> nor
        /// <see cref="TextCorpusType.Target"/>.
        /// </exception>
        public Task <ITextCorpus> CreateAsync(IEnumerable <string> projects, TextCorpusType type)
        {
            // The sub-directory depends only on the corpus type, so resolve it once. An unknown value is rejected
            // explicitly; previously it left the directory null and surfaced as an ArgumentNullException thrown
            // from Path.Combine, which gave no hint that the enum value was the problem.
            string dir;
            switch (type)
            {
            case TextCorpusType.Source:
                dir = "source";
                break;

            case TextCorpusType.Target:
                dir = "target";
                break;

            default:
                throw new System.ComponentModel.InvalidEnumArgumentException(nameof(type), (int)type,
                                                                             typeof(TextCorpusType));
            }

            var wordTokenizer = new LatinWordTokenizer();
            var texts         = new List <IText>();

            foreach (string projectId in projects)
            {
                string projectDir = Path.Combine(_textFileDir, projectId, dir);
                foreach (string file in Directory.EnumerateFiles(projectDir, "*.txt"))
                {
                    // The text id is the project id joined to the file name without its extension.
                    string textId = $"{projectId}_{Path.GetFileNameWithoutExtension(file)}";
                    texts.Add(new TextFileText(wordTokenizer, textId, file));
                }
            }

            return(Task.FromResult <ITextCorpus>(new DictionaryTextCorpus(texts)));
        }
Exemple #3
0
        public void Segments_EmptyDoc()
        {
            // A rendering list without any child elements.
            var emptyList = new XElement("TermRenderingsList");
            var text      = new SFBiblicalTermsText(new LatinWordTokenizer(), "project01", new XDocument(emptyList));

            // The test expects no segments for such a document.
            Assert.That(text.Segments, Is.Empty);
        }
        public void Texts()
        {
            string[] expectedIds = { "MAT", "MRK" };

            var corpus = new UsfmFileTextCorpus(new LatinWordTokenizer(), CorporaTestHelpers.UsfmStylesheetPath,
                                                Encoding.UTF8, CorporaTestHelpers.UsfmTestProjectPath);
            var actualIds = corpus.Texts.Select(t => t.Id);

            // Only the set of ids is compared; their order is not.
            Assert.That(actualIds, Is.EquivalentTo(expectedIds));
        }
        public void Create_HasDocOps_HasSegments()
        {
            int    expectedOps      = 3;
            int    expectedSegments = 1;
            int    book             = 40;
            int    chapter          = 1;
            string project          = "myProject";
            string expectedId       = $"{project}_{book}_{chapter}";

            // Build a BsonDocument that looks like data from SF DB - xforge - texts:
            // a chapter marker, a verse marker, and one run of verse text tagged with a segment reference.
            var chapterOp = new BsonDocument
            {
                { "insert", new BsonDocument
                  {
                      { "chapter", new BsonDocument { { "number", "1" }, { "style", "c" } } }
                  } }
            };
            var verseOp = new BsonDocument
            {
                { "insert", new BsonDocument
                  {
                      { "verse", new BsonDocument { { "number", "1" }, { "style", "v" } } }
                  } }
            };
            var verseTextOp = new BsonDocument
            {
                { "insert", "First verse text here" },
                { "attributes", new BsonDocument { { "segment", "verse_1_1" } } }
            };
            var doc = new BsonDocument
            {
                { "_id", "abc123:MAT:1:target" },
                { "ops", new BsonArray { chapterOp, verseOp, verseTextOp } }
            };

            // Sanity-check the fixture before exercising the class under test.
            var opsInDoc = (BsonArray)doc["ops"];
            Assert.That(opsInDoc.Count, Is.EqualTo(expectedOps), "Setup");

            // SUT
            var text = new SFScriptureText(new LatinWordTokenizer(), project, book, chapter, doc);

            Assert.That(text.Id, Is.EqualTo(expectedId));
            Assert.That(text.Segments.Count(), Is.EqualTo(expectedSegments));
        }
Exemple #6
0
        public void Segments_Guess()
        {
            // Both renderings in this list are created with guess: true.
            var renderingList = new XElement("TermRenderingsList",
                                             TermRendering("term1", guess: true, "Term1"),
                                             TermRendering("term2", guess: true, "Term2"));
            var text = new SFBiblicalTermsText(new LatinWordTokenizer(), "project01",
                                               new XDocument(renderingList));

            // The test expects that none of them yields a segment.
            Assert.That(text.Segments, Is.Empty);
        }
Exemple #7
0
        /// <summary>
        /// Creates an engine for the given project that talks to the translation REST API at
        /// <paramref name="baseUrl"/>.
        /// </summary>
        /// <param name="baseUrl">Base URL handed to the REST client.</param>
        /// <param name="projectId">Id stored in <c>ProjectId</c>.</param>
        /// <param name="httpClient">HTTP client to use; a new <c>AjaxHttpClient</c> is created when null.</param>
        public TranslationEngine(string baseUrl, string projectId, IHttpClient httpClient = null)
        {
            ProjectId = projectId;

            // One tokenizer instance serves both the source and the target side.
            var sharedTokenizer = new LatinWordTokenizer();
            SourceWordTokenizer = sharedTokenizer;
            TargetWordTokenizer = sharedTokenizer;

            IHttpClient client = httpClient ?? new AjaxHttpClient();
            RestClient = new TranslationRestClient(baseUrl, client);

            ErrorCorrectionModel = new ErrorCorrectionModel();
            _cts                 = new CancellationTokenSource();
        }
        public void Segments_EmptyText()
        {
            var corpus = new UsfmFileTextCorpus(new LatinWordTokenizer(), CorporaTestHelpers.UsfmStylesheetPath,
                                                Encoding.UTF8, CorporaTestHelpers.UsfmTestProjectPath);

            // Materialize the segments of the "MRK" text; the test expects there to be none.
            TextSegment[] mrkSegments = corpus.GetText("MRK").Segments.ToArray();

            Assert.That(mrkSegments, Is.Empty);
        }
        public void Create_NullDoc_Crash()
        {
            int          book       = 40;
            int          chapter    = 1;
            string       project    = "myProject";
            BsonDocument missingDoc = null;
            var          tokenizer  = new LatinWordTokenizer();

            // SUT: constructing the text from a null document must raise ArgumentNullException.
            Assert.Throws <ArgumentNullException>(
                () => new SFScriptureText(tokenizer, project, book, chapter, missingDoc));
        }
        public void TryGetText()
        {
            var corpus = new UsfmFileTextCorpus(new LatinWordTokenizer(), CorporaTestHelpers.UsfmStylesheetPath,
                                                Encoding.UTF8, CorporaTestHelpers.UsfmTestProjectPath);

            // "MAT" is expected to be found and returned through the out parameter.
            bool foundMat = corpus.TryGetText("MAT", out IText matText);
            Assert.That(foundMat, Is.True);
            Assert.That(matText.Id, Is.EqualTo("MAT"));

            // "LUK" is expected to be reported as missing.
            bool foundLuk = corpus.TryGetText("LUK", out _);
            Assert.That(foundLuk, Is.False);
        }
Exemple #11
0
        public void Segments_Renderings()
        {
            // The renderings are supplied as term2 then term1; the assertions below expect term1's segment first.
            var renderingList = new XElement("TermRenderingsList",
                                             TermRendering("term2", guess: false, "Term2"),
                                             TermRendering("term1", guess: false, "Term1"));
            var text = new SFBiblicalTermsText(new LatinWordTokenizer(), "project01",
                                               new XDocument(renderingList));

            TextSegment[] actual = text.Segments.ToArray();
            Assert.That(actual.Length, Is.EqualTo(2));

            TextSegment first  = actual[0];
            TextSegment second = actual[1];
            Assert.That(first.SegmentRef.ToString(), Is.EqualTo("term1"));
            Assert.That(string.Join(" ", first.Segment), Is.EqualTo("Term1"));
            Assert.That(string.Join(" ", second.Segment), Is.EqualTo("Term2"));
        }
        public void Create_NoSegments_EmptySegments()
        {
            int    expectedOps      = 2;
            int    expectedSegments = 0;
            int    book             = 40;
            int    chapter          = 1;
            string project          = "myProject";
            string expectedId       = $"{project}_{book}_{chapter}";

            // Only a chapter marker and a verse marker:
            // there are no verse text inserts with a segment reference.
            var chapterOp = new BsonDocument
            {
                { "insert", new BsonDocument
                  {
                      { "chapter", new BsonDocument { { "number", "1" }, { "style", "c" } } }
                  } }
            };
            var verseOp = new BsonDocument
            {
                { "insert", new BsonDocument
                  {
                      { "verse", new BsonDocument { { "number", "1" }, { "style", "v" } } }
                  } }
            };
            var doc = new BsonDocument
            {
                { "_id", "abc123:MAT:1:target" },
                { "ops", new BsonArray { chapterOp, verseOp } }
            };

            // Sanity-check the fixture before exercising the class under test.
            var opsInDoc = (BsonArray)doc["ops"];
            Assert.That(opsInDoc.Count, Is.EqualTo(expectedOps), "Setup");

            // SUT
            var text = new SFScriptureText(new LatinWordTokenizer(), project, book, chapter, doc);

            Assert.That(text.Id, Is.EqualTo(expectedId));
            Assert.That(text.Segments.Count(), Is.EqualTo(expectedSegments));
        }
        public void Create_MissingOps_Crash()
        {
            int    book    = 40;
            int    chapter = 1;
            string project = "myProject";

            // The document has an id but deliberately no "ops" element.
            var docWithoutOps = new BsonDocument { { "_id", "abc123:MAT:1:target" } };
            bool hasOps = docWithoutOps.Contains("ops");
            Assert.That(hasOps, Is.False, "Setup");

            var tokenizer = new LatinWordTokenizer();

            // SUT: constructing the text from such a document must raise ArgumentException.
            Assert.Throws <ArgumentException>(
                () => new SFScriptureText(tokenizer, project, book, chapter, docWithoutOps));
        }
        /// <summary>
        /// Builds one <see cref="SFScriptureText"/> per chapter of every text that belongs to the given projects,
        /// reading each chapter's text data document from the text data collection.
        /// </summary>
        /// <param name="projects">Ids of the projects whose texts are loaded.</param>
        /// <param name="type">Whether the source or the target text data of each chapter is loaded.</param>
        /// <returns>The created texts, in the order in which they were read.</returns>
        /// <exception cref="InvalidEnumArgumentException">
        /// <paramref name="type"/> is neither <see cref="TextCorpusType.Source"/> nor
        /// <see cref="TextCorpusType.Target"/>.
        /// </exception>
        private async Task <IReadOnlyList <IText> > CreateTextsAsync(IEnumerable <string> projects,
                                                                     TextCorpusType type)
        {
            // The mapping depends only on the argument, so it is done (and validated) once rather than once per
            // project. The exception names TextCorpusType, the enum the argument actually belongs to; it used to
            // name TextType, which produced a misleading message.
            TextType textType;
            switch (type)
            {
            case TextCorpusType.Source:
                textType = TextType.Source;
                break;

            case TextCorpusType.Target:
                textType = TextType.Target;
                break;

            default:
                throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
            }

            StringTokenizer wordTokenizer = new LatinWordTokenizer();
            IMongoDatabase  database      = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
            IMongoCollection <BsonDocument> textDataColl = database.GetCollection <BsonDocument>(
                SFDataAccessConstants.TextDataCollectionName);
            var texts = new List <IText>();

            foreach (string projectId in projects)
            {
                List <TextEntity> textList = await _texts.Query().Where(t => t.ProjectRef == projectId).ToListAsync();

                foreach (TextEntity text in textList)
                {
                    foreach (Chapter chapter in text.Chapters)
                    {
                        // Each chapter's text data is stored under an id derived from the text id, the chapter
                        // number and the text type.
                        FilterDefinition <BsonDocument> filter = Builders <BsonDocument> .Filter
                                                                 .Eq("_id", TextEntity.GetTextDataId(text.Id, chapter.Number, textType));

                        // NOTE(review): FirstAsync fails when no document matches, so a chapter without a text
                        // data document aborts the whole build - confirm that this is intended.
                        BsonDocument doc = await textDataColl.Find(filter).FirstAsync();

                        texts.Add(new SFScriptureText(wordTokenizer, projectId, text.Id, chapter.Number, doc));
                    }
                }
            }

            return(texts);
        }
        public void Segments_NonEmptyText()
        {
            var corpus = new UsfmFileTextCorpus(new LatinWordTokenizer(), CorporaTestHelpers.UsfmStylesheetPath,
                                                Encoding.UTF8, CorporaTestHelpers.UsfmTestProjectPath);

            TextSegment[] segments = corpus.GetText("MAT").Segments.ToArray();
            Assert.That(segments.Length, Is.EqualTo(10));

            // Checks one segment's verse reference and its tokens (given as a whitespace-separated string).
            void AssertSegment(int index, string verse, string tokens)
            {
                Assert.That(segments[index].SegmentRef, Is.EqualTo(new VerseRef(verse, corpus.Versification)));
                Assert.That(segments[index].Segment, Is.EqualTo(tokens.Split()));
            }

            // Spot-check the first two segments, the first segment of chapter two, and the last segment.
            AssertSegment(0, "MAT 1:1", "Chapter one , verse one .");
            AssertSegment(1, "MAT 1:2", "Chapter one , verse two .");
            AssertSegment(5, "MAT 2:1", "Chapter two , verse one .");
            AssertSegment(9, "MAT 2:5", "Chapter two , verse five .");
        }
Exemple #16
0
        /// <summary>
        /// Builds one <see cref="SFScriptureText"/> for every chapter, of every text that has a source, whose
        /// text document exists and contains an "ops" array.
        /// </summary>
        /// <param name="projects">Ids of the projects whose snapshots are read.</param>
        /// <param name="type">
        /// Selects whose text documents are read: the project's configured source project, or the project itself.
        /// </param>
        /// <returns>The created texts.</returns>
        /// <exception cref="DataNotFoundException">A project has no source project reference.</exception>
        /// <exception cref="InvalidEnumArgumentException"><paramref name="type"/> is not a known value.</exception>
        private async Task <IReadOnlyList <IText> > CreateTextsAsync(IEnumerable <string> projects,
                                                                     TextCorpusType type)
        {
            StringTokenizer tokenizer = new LatinWordTokenizer();
            IMongoDatabase  db        = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
            IMongoCollection <BsonDocument> textDocs = db.GetCollection <BsonDocument>(
                _realtimeService.GetCollectionName <TextData>());
            var result = new List <IText>();

            foreach (string projectId in projects)
            {
                var project = await _realtimeService.GetSnapshotAsync <SFProject>(projectId);

                // A source project reference is required regardless of which corpus type was requested.
                if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
                {
                    throw new DataNotFoundException("The source project reference is missing");
                }

                string corpusProjectId;
                switch (type)
                {
                case TextCorpusType.Source:
                    corpusProjectId = project.TranslateConfig.Source.ProjectRef;
                    break;

                case TextCorpusType.Target:
                    corpusProjectId = projectId;
                    break;

                default:
                    throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
                }

                foreach (TextInfo textInfo in project.Texts.Where(t => t.HasSource))
                {
                    foreach (Chapter chapter in textInfo.Chapters)
                    {
                        string docId = TextData.GetTextDocId(corpusProjectId, textInfo.BookNum, chapter.Number);
                        BsonDocument doc = await textDocs
                                           .Find(Builders <BsonDocument> .Filter.Eq("_id", docId))
                                           .FirstOrDefaultAsync();

                        // Skip chapters whose document is missing or carries no "ops" array.
                        if (doc == null || !doc.TryGetValue("ops", out BsonValue ops) || !(ops is BsonArray))
                        {
                            continue;
                        }

                        // The text is registered under the requested project id, not the corpus project id.
                        result.Add(new SFScriptureText(tokenizer, projectId, textInfo.BookNum, chapter.Number, doc));
                    }
                }
            }

            return(result);
        }
Exemple #17
0
        private static void Approve_Success(Assert assert)
        {
            string sourceText   = "En el principio la Palabra ya existía.";
            string targetPrefix = "In the beginning the Word already existed.";

            // The mocked endpoint checks that the posted segment pair holds the tokenized source and prefix.
            var trainSegmentRequest = new MockRequest
            {
                Method = HttpRequestMethod.Post,
                Url    = "translation/engines/project:project1/actions/trainSegment",
                Action = (body, ct) =>
                {
                    var posted = JsonConvert.DeserializeObject <SegmentPairDto>(body,
                                                                                RestClientBase.SerializerSettings);
                    var wordTokenizer  = new LatinWordTokenizer();
                    var expectedSource = wordTokenizer.TokenizeToStrings(sourceText).ToArray();
                    var expectedTarget = wordTokenizer.TokenizeToStrings(targetPrefix).ToArray();

                    assert.DeepEqual(posted.SourceSegment, expectedSource);
                    assert.DeepEqual(posted.TargetSegment, expectedTarget);
                    return(Task.FromResult(true));
                },
                ResponseText = ""
            };

            MockHttpClient httpClient = CreateWebClient();
            httpClient.Requests.Add(trainSegmentRequest);

            var    engine = new TranslationEngine("http://localhost/", "project1", httpClient);
            Action finish = assert.Async();

            // Start an interactive session, type the prefix, then approve it; the test finishes in the callback.
            engine.TranslateInteractively(sourceText, 0.2, session =>
            {
                assert.NotEqual(session, null);
                session.Initialize();
                session.UpdatePrefix(targetPrefix);
                session.Approve(approved =>
                {
                    assert.Ok(approved);
                    finish();
                });
            });
        }
        /// <summary>
        /// Builds the texts of the given projects: one <see cref="SFScriptureText"/> per chapter whose text
        /// document exists and contains an "ops" array, plus one <see cref="SFBiblicalTermsText"/> per project
        /// when its "TermRenderings.xml" file exists.
        /// </summary>
        /// <param name="projects">Ids of the projects whose snapshots are read.</param>
        /// <param name="type">
        /// Selects whose data is read: the project's configured source project, or the project itself.
        /// </param>
        /// <returns>The created texts.</returns>
        /// <exception cref="DataNotFoundException">A project has no source project reference.</exception>
        /// <exception cref="InvalidEnumArgumentException"><paramref name="type"/> is not a known value.</exception>
        private async Task <IReadOnlyList <IText> > CreateTextsAsync(IEnumerable <string> projects,
                                                                     TextCorpusType type)
        {
            StringTokenizer tokenizer = new LatinWordTokenizer();
            IMongoDatabase  db        = _mongoClient.GetDatabase(_dataAccessOptions.Value.MongoDatabaseName);
            IMongoCollection <BsonDocument> textDocs = db.GetCollection <BsonDocument>(
                _realtimeService.GetCollectionName <TextData>());
            var result = new List <IText>();

            foreach (string projectId in projects)
            {
                var project = await _realtimeService.GetSnapshotAsync <SFProject>(projectId);

                // A source project reference is required regardless of which corpus type was requested.
                if (string.IsNullOrWhiteSpace(project.TranslateConfig.Source?.ProjectRef))
                {
                    throw new DataNotFoundException("The source project reference is missing");
                }

                // Pick the project whose text documents are read and the Paratext id used for the sync directory.
                string corpusProjectId;
                string corpusParatextId;
                if (type == TextCorpusType.Source)
                {
                    corpusProjectId  = project.TranslateConfig.Source.ProjectRef;
                    corpusParatextId = project.TranslateConfig.Source.ParatextId;
                }
                else if (type == TextCorpusType.Target)
                {
                    corpusProjectId  = projectId;
                    corpusParatextId = project.ParatextId;
                }
                else
                {
                    throw new InvalidEnumArgumentException(nameof(type), (int)type, typeof(TextCorpusType));
                }

                foreach (TextInfo textInfo in project.Texts.Where(t => t.HasSource))
                {
                    foreach (Chapter chapter in textInfo.Chapters)
                    {
                        string docId = TextData.GetTextDocId(corpusProjectId, textInfo.BookNum, chapter.Number);
                        BsonDocument doc = await textDocs
                                           .Find(Builders <BsonDocument> .Filter.Eq("_id", docId))
                                           .FirstOrDefaultAsync();

                        // Skip chapters whose document is missing or carries no "ops" array.
                        if (doc == null || !doc.TryGetValue("ops", out BsonValue ops) || !(ops is BsonArray))
                        {
                            continue;
                        }

                        result.Add(new SFScriptureText(tokenizer, projectId, textInfo.BookNum, chapter.Number, doc));
                    }
                }

                // The term renderings file is optional; it is read from the "target" folder of the sync directory
                // for both corpus types.
                string renderingsPath = Path.Combine(_siteOptions.Value.SiteDir, "sync", corpusParatextId,
                                                     "target", "TermRenderings.xml");
                if (!_fileSystemService.FileExists(renderingsPath))
                {
                    continue;
                }

                using (var stream = _fileSystemService.OpenFile(renderingsPath, FileMode.Open))
                {
                    XDocument renderings = await XDocument.LoadAsync(stream, LoadOptions.None,
                                                                     CancellationToken.None);

                    result.Add(new SFBiblicalTermsText(tokenizer, projectId, renderings));
                }
            }

            return(result);
        }
        /// <summary>
        /// Builds one text per non-deleted "translate" document of every given project that exists both in the
        /// project repository and in the "projects" collection.
        /// </summary>
        /// <param name="projects">Project ids; each must be parseable as an <see cref="ObjectId"/>.</param>
        /// <param name="type">Selects the "source" or "target" document suffix and segment type.</param>
        /// <returns>The created texts.</returns>
        private async Task <IReadOnlyList <IText> > CreateTextsAsync(IEnumerable <string> projects,
                                                                     TextCorpusType type)
        {
            StringTokenizer wordTokenizer = new LatinWordTokenizer();
            IMongoDatabase  sfDb          = _mongoClient.GetDatabase("scriptureforge");
            IMongoDatabase  realtimeDb    = _mongoClient.GetDatabase("realtime");
            IMongoCollection <BsonDocument> projectDocs = sfDb.GetCollection <BsonDocument>("projects");
            var result = new List <IText>();

            foreach (string projectId in projects)
            {
                Project project = await _projectRepo.GetAsync(projectId);
                if (project == null)
                {
                    continue;
                }

                // Both values stay null when the corpus type is neither source nor target.
                string segmentType = null;
                string docSuffix   = null;
                if (type == TextCorpusType.Source)
                {
                    docSuffix   = "source";
                    segmentType = project.SourceSegmentType;
                }
                else if (type == TextCorpusType.Target)
                {
                    docSuffix   = "target";
                    segmentType = project.TargetSegmentType;
                }

                StringTokenizer segmentTokenizer = segmentType != null
                    ? WebApiUtils.CreateSegmentTokenizer(segmentType)
                    : null;

                FilterDefinition <BsonDocument> byProjectId = Builders <BsonDocument> .Filter.Eq("_id",
                                                                                                 ObjectId.Parse(projectId));
                BsonDocument projectDoc = await projectDocs.Find(byProjectId).FirstOrDefaultAsync();
                if (projectDoc == null)
                {
                    continue;
                }

                // "sf_" + project code names both the realtime collection and the per-project database.
                var code        = "sf_" + (string)projectDoc["projectCode"];
                var isScripture = (bool)projectDoc["config"]["isTranslationDataScripture"];

                IMongoCollection <BsonDocument> realtimeDocs  = realtimeDb.GetCollection <BsonDocument>(code);
                IMongoDatabase projectDb = _mongoClient.GetDatabase(code);
                IMongoCollection <BsonDocument> translateDocs = projectDb.GetCollection <BsonDocument>("translate");
                FilterDefinition <BsonDocument> notDeleted    = Builders <BsonDocument> .Filter.Eq("isDeleted", false);

                using (IAsyncCursor <BsonDocument> cursor = await translateDocs.Find(notDeleted).ToCursorAsync())
                {
                    while (await cursor.MoveNextAsync())
                    {
                        foreach (BsonDocument docInfo in cursor.Current)
                        {
                            // The realtime document id is the translate document id followed by ":<suffix>".
                            var translateId = (ObjectId)docInfo["_id"];
                            FilterDefinition <BsonDocument> byDocId = Builders <BsonDocument> .Filter.Eq(
                                "_id", $"{translateId}:{docSuffix}");

                            BsonDocument doc = await realtimeDocs.Find(byDocId).FirstAsync();

                            IText text = isScripture
                                ? (IText)new XForgeScriptureText(wordTokenizer, project.Id, doc)
                                : new XForgeRichText(segmentTokenizer, wordTokenizer, project.Id, doc);
                            result.Add(text);
                        }
                    }
                }
            }

            return(result);
        }