/// <summary>
/// Collects the guids of every objsur element in <paramref name="xmlBytes"/> whose "t"
/// attribute is "o" (the value this migration uses for owning references).
/// </summary>
/// <param name="xmlBytes">Bytes to scan for objsur elements.</param>
/// <returns>The guid attribute values, lower-cased (invariant); empty when none qualify.</returns>
private static HashSet<string> GetOwnees(byte[] xmlBytes)
{
    var result = new HashSet<string>();
    var limit = xmlBytes.Length;
    // Walk from one objsur element to the next until no further element is found.
    for (var bounds = new ElementBounds(xmlBytes, s_tagsObjsur);
         bounds.IsValid;
         bounds.Reset(bounds.EndTagOffset, limit))
    {
        if (bounds.GetAttributeValue(s_tAttr) != "o")
            continue; // Not an owning objsur.

        var ownedGuid = bounds.GetAttributeValue(s_guidAttr);
        if (String.IsNullOrEmpty(ownedGuid))
            continue; // Nothing usable to record.

        result.Add(ownedGuid.ToLowerInvariant());
    }
    return result;
}
/// <summary>
/// Reads the "val" attribute of the EndOffset element nested in the CmBaseAnnotation element.
/// </summary>
/// <param name="xmlBytes">Bytes of the annotation to inspect.</param>
/// <returns>
/// The parsed offset, or 0 when the CmBaseAnnotation element, the EndOffset element,
/// or the attribute value is missing or empty.
/// </returns>
/// <exception cref="FormatException">The attribute value is present but is not an integer.</exception>
internal static int GetEndOffset(byte[] xmlBytes)
{
    var baseBounds = new ElementBounds(xmlBytes, s_tagsCmBaseAnnotation);
    if (!baseBounds.IsValid)
        return 0;

    var offsetBounds = new ElementBounds(xmlBytes, s_tagsEndOffset,
        baseBounds.BeginTagOffset, baseBounds.EndTagOffset);
    if (!offsetBounds.IsValid)
        return 0;

    var val = offsetBounds.GetAttributeValue(s_valAttr);
    if (String.IsNullOrEmpty(val))
        return 0;

    // The value is machine-written data, so parse it independently of the
    // current thread culture (sign/digit conventions differ between cultures).
    return int.Parse(val, System.Globalization.CultureInfo.InvariantCulture);
}
/// <summary>
/// Builds a new element named <paramref name="elementName"/> that holds the AStr elements
/// found inside the Comment element of the CmAnnotation element.
/// </summary>
/// <param name="xmlBytes">Bytes of the annotation to inspect.</param>
/// <param name="elementName">Name given to the returned element.</param>
/// <returns>
/// The new element, or null when there is no CmAnnotation element or no Comment element in it.
/// </returns>
private static XElement GetOptionalComment(byte[] xmlBytes, string elementName)
{
    var annoBounds = new ElementBounds(xmlBytes, s_tagsCmAnnotation);
    // Same guard the sibling accessors use: never search with offsets taken from invalid bounds.
    if (!annoBounds.IsValid)
        return null;

    var commentBounds = new ElementBounds(xmlBytes, s_tagsComment,
        annoBounds.BeginTagOffset, annoBounds.EndTagOffset);
    if (!commentBounds.IsValid)
        return null;

    return new XElement(elementName,
        GetAStrElements(xmlBytes, commentBounds.BeginTagOffset, commentBounds.EndTagOffset));
}
/// <summary>
/// Parses every AStr element found between two offsets into an XElement.
/// </summary>
/// <param name="xmlBytes">Bytes that contain the AStr elements.</param>
/// <param name="ichMin">Offset at which the search starts.</param>
/// <param name="ichLim">Offset that limits the search.</param>
/// <returns>The parsed elements in the order they were found; empty when there are none.</returns>
private static List<XElement> GetAStrElements(byte[] xmlBytes, int ichMin, int ichLim)
{
    var elements = new List<XElement>();
    for (var bounds = new ElementBounds(xmlBytes, s_tagsAStr, ichMin, ichLim);
         bounds.IsValid;
         bounds.Reset(bounds.EndTagOffset, ichLim))
    {
        // Decode just the bytes of this one element and let LINQ to XML parse them.
        var elementXml = Encoding.UTF8.GetString(xmlBytes, bounds.BeginTagOffset, bounds.Length);
        elements.Add(XElement.Parse(elementXml));
    }
    return elements;
}
/// <summary>
/// Gathers the guid of each objsur element whose "t" attribute equals "o".
/// </summary>
/// <param name="xmlBytes">Bytes to scan for objsur elements.</param>
/// <returns>Set of lower-cased (invariant) guid strings; empty when nothing matches.</returns>
private static HashSet<string> GetOwnees(byte[] xmlBytes)
{
    var ownedGuids = new HashSet<string>();
    var cursor = new ElementBounds(xmlBytes, s_tagsObjsur);
    while (cursor.IsValid)
    {
        // Only look up the guid for objsur elements of type "o".
        var guidValue = cursor.GetAttributeValue(s_tAttr) == "o"
            ? cursor.GetAttributeValue(s_guidAttr)
            : null;
        if (!String.IsNullOrEmpty(guidValue))
            ownedGuids.Add(guidValue.ToLowerInvariant());

        // Continue the search right after the element just examined.
        cursor.Reset(cursor.EndTagOffset, xmlBytes.Length);
    }
    return ownedGuids;
}
/// <summary>
/// Converts each old chart cell annotation (CCA) listed in <paramref name="objsurElementGuids"/>
/// into a new ConstituentChartCellPart subclass object, adds the new object to
/// <paramref name="dtoRepos"/>, and returns one owning objsur element per converted object.
/// </summary>
/// <param name="dtoRepos">Repository that supplies xfic DTOs and receives the new objects.</param>
/// <param name="paraToOldSegments">Old segments per paragraph guid; the integer key is compared with the first xfic's BeginOffset.</param>
/// <param name="halfBakedCcwgItems">Receives (old annotation bytes, new element) pairs for word groups that are finished later.</param>
/// <param name="oldCCAs">Old annotation bytes keyed by old guid string.</param>
/// <param name="ccaGuidMap">Maps old guids to new guids.</param>
/// <param name="objsurElementGuids">Old guids of the annotations to convert.</param>
/// <param name="owningCCRGuid">Guid written as "ownerguid" on every new object.</param>
/// <returns>The owning objsur elements for the objects that were converted.</returns>
/// <exception cref="InvalidOperationException">
/// The old annotation has an unrecognized class, or its first AppliesTo target has an unrecognized annotation type.
/// </exception>
private static IEnumerable<XElement> ConvertCCAsAndAddCCRObjSurElements(
    IDomainObjectDTORepository dtoRepos,
    IDictionary<string, SortedList<int, byte[]>> paraToOldSegments,
    ICollection<KeyValuePair<byte[], XElement>> halfBakedCcwgItems,
    IDictionary<string, byte[]> oldCCAs,
    IDictionary<Guid, Guid> ccaGuidMap,
    List<string> objsurElementGuids,
    string owningCCRGuid)
{
    // 'retval' will be put into the new CCR of the caller as owning objsur elements.
    // (List<T>.Count property; no need for the LINQ Count() extension.)
    var retval = new List<XElement>(objsurElementGuids.Count);

    // Decide which class to convert the old CCA to:
    // 1. CmBaseAnnotations -> ConstChartTag
    // 2. CmIndirectAnnotation
    //    A. 1 (or more) CCRs in AppliesTo -> ConstChartClauseMarker
    //    B. 1 (only) CCA in AppliesTo -> ConstChartMovedTextMarker
    //    C. null AppliesTo -> ConstChartWordGroup
    //    D. 1 (or more) twfics (pfics?) -> ConstChartWordGroup
    const string kChangeMe = "CHANGEME";
    foreach (var oldCCAGuid in objsurElementGuids)
    {
        var guidOldCCA = new Guid(oldCCAGuid);
        // Earlier 'SortOutMultipleXficBackRefs()' may have left a dangling reference.
        // If so, skip it. The entry is left in 'oldCCAs' so it stays available to later lookups.
        byte[] oldAnnElement;
        if (!oldCCAs.TryGetValue(oldCCAGuid, out oldAnnElement))
            continue;

        var guidNew = ccaGuidMap[guidOldCCA];
        var newGuid = guidNew.ToString().ToLowerInvariant();
        var newClassName = kChangeMe;

        // Collect up the inner class-level elements.
        var cmAnnotationBounds = new ElementBounds(oldAnnElement, s_tagsCmAnnotation);
        if (!cmAnnotationBounds.IsValid)
            continue;

        // Fix FWR-2139 crash migrating because of missing InstanceOf:
        // skip a chart annotation with no column reference.
        var guidInstanceOf = GetInstanceOfGuid(oldAnnElement);
        if (String.IsNullOrEmpty(guidInstanceOf))
            continue;

        var mergesAfterElement = new XElement("MergesAfter", new XAttribute("val", "False"));
        var mergesBeforeElement = new XElement("MergesBefore", new XAttribute("val", "False"));

        // The CompDetails element is optional.
        var compDetailsBounds = new ElementBounds(oldAnnElement, s_tagsCompDetails,
            cmAnnotationBounds.BeginTagOffset, cmAnnotationBounds.EndTagOffset);
        if (compDetailsBounds.IsValid)
        {
            var uniBounds = new ElementBounds(oldAnnElement, s_tagsUni,
                compDetailsBounds.BeginTagOffset, compDetailsBounds.EndTagOffset);
            if (uniBounds.IsValid)
            {
                // See if the optional details change MergesAfter or MergesBefore.
                var mergeAfter = GetAttributeValue(oldAnnElement, s_mergeAfterAttr,
                    uniBounds.BeginTagOffset, uniBounds.EndTagOffset);
                if (mergeAfter == "true")
                    mergesAfterElement.Attribute("val").Value = "True";

                var mergeBefore = GetAttributeValue(oldAnnElement, s_mergeBeforeAttr,
                    uniBounds.BeginTagOffset, uniBounds.EndTagOffset);
                if (mergeBefore == "true")
                    mergesBeforeElement.Attribute("val").Value = "True";
            }
        }

        // The element name and class attribute are placeholders here; they are reset below,
        // depending on which subclass of ConstituentChartCellPart is used.
        var newSpecificClassElement = new XElement(newClassName);
        var newClassAttr = new XAttribute("class", newClassName);
        var newCCAElement = new XElement("rt",
            newClassAttr,
            new XAttribute("guid", newGuid),
            new XAttribute("ownerguid", owningCCRGuid),
            new XElement("CmObject"),
            new XElement("ConstituentChartCellPart",
                new XElement("Column",
                    new XElement("objsur",
                        new XAttribute("t", "r"),
                        new XAttribute("guid", guidInstanceOf))),
                mergesAfterElement,
                mergesBeforeElement),
            newSpecificClassElement);

        var classValue = GetClassValue(oldAnnElement);
        switch (classValue)
        {
            default:
                throw new InvalidOperationException("Unrecognized annotation class used as CCA.");

            case "CmBaseAnnotation":
            {
                // #1.
                newClassName = "ConstChartTag";
                newSpecificClassElement.Name = newClassName;
                newClassAttr.Value = newClassName;
                // Tag is atomic ref.
                var guidBeginObject = GetBeginObjectGuid(oldAnnElement);
                if (!String.IsNullOrEmpty(guidBeginObject))
                {
                    newSpecificClassElement.Add(
                        new XElement("Tag",
                            new XElement("objsur",
                                new XAttribute("t", "r"),
                                new XAttribute("guid", guidBeginObject))));
                }
                break;
            }

            case "CmIndirectAnnotation":
            {
                // #2.
                // Get the optional 'AppliesTo' element.
                var refsFromAppliesTo = GetAppliesToObjsurGuids(oldAnnElement);
                if (refsFromAppliesTo == null || refsFromAppliesTo.Count == 0)
                {
                    // 2.C
                    newClassName = "ConstChartWordGroup";
                    newSpecificClassElement.Name = newClassName;
                    newClassAttr.Value = newClassName;
                    // BeginSegment & EndSegment are to be null, so leave them out altogether.
                    // BeginAnalysisIndex & EndAnalysisIndex are both -1.
                    // Note: This is actually wrong; this should be a ConstChartTag with no Tag
                    // But it gets fixed in DM7000013.
                    newSpecificClassElement.Add(new XElement("BeginAnalysisIndex", new XAttribute("val", "-1")));
                    newSpecificClassElement.Add(new XElement("EndAnalysisIndex", new XAttribute("val", "-1")));
                    break;
                }

                // Get the class of the first (or only) objsur.
                var currentRefGuid = refsFromAppliesTo[0];
                var currentInnerTarget = oldCCAs[currentRefGuid];
                switch (GetAnnotationTypeGuid(currentInnerTarget))
                {
                    default:
                        throw new InvalidOperationException("Unrecognized annotation type for CCA.");

                    case DataMigrationServices.kConstituentChartRowAnnDefnGuid:
                    {
                        // One, or more, CCRs.
                        // 2.A
                        newClassName = "ConstChartClauseMarker";
                        newSpecificClassElement.Name = newClassName;
                        newClassAttr.Value = newClassName;
                        var dependentClausesElement = new XElement("DependentClauses");
                        newSpecificClassElement.Add(dependentClausesElement);
                        foreach (var guid in refsFromAppliesTo)
                        {
                            // DependentClauses is ref. seq. prop.
                            dependentClausesElement.Add(
                                DataMigrationServices.CreateReferenceObjSurElement(
                                    ccaGuidMap[new Guid(guid)].ToString().ToLowerInvariant()));
                        }
                        break;
                    }

                    case DataMigrationServices.kConstituentChartAnnotationAnnDefnGuid:
                    {
                        // Single CCA.
                        // 2.B
                        newClassName = "ConstChartMovedTextMarker";
                        newSpecificClassElement.Name = newClassName;
                        newClassAttr.Value = newClassName;
                        // WordGroup - Get new guid from cca guid map using old CCA guid.
                        newSpecificClassElement.Add(new XElement("WordGroup",
                            DataMigrationServices.CreateReferenceObjSurElement(
                                ccaGuidMap[new Guid(currentRefGuid)].ToString().ToLowerInvariant())));
                        // Preposed - Boolean.
                        // The data migration for the Preposed boolean is simple.
                        // If the old Marker's Comment property contains a string "<<" then it's true,
                        // false otherwise.
                        newSpecificClassElement.Add(new XElement("Preposed",
                            new XAttribute("val",
                                GetPreposedBooleanFromComment(oldAnnElement,
                                    cmAnnotationBounds.BeginTagOffset, cmAnnotationBounds.EndTagOffset))));
                        break;
                    }

                    case DataMigrationServices.kPficAnnDefnGuid: // Fall through.
                    case DataMigrationServices.kTwficAnnDefnGuid:
                    {
                        // One, or more, twfics/pfics
                        // These all go into halfBakedCcwgItems,
                        // and will be finished when Segments and xfics are finished.
                        // NB: While there may be multiple xfics,
                        // only the first and last are stored in the two indices.
                        var firstXficGuid = currentRefGuid;
                        var lastXficGuid = refsFromAppliesTo[refsFromAppliesTo.Count - 1];
                        var firstXficInnerAnnElement =
                            XElement.Parse(dtoRepos.GetDTO(firstXficGuid).Xml).Element("CmBaseAnnotation");
                        // Gotta make sure the xfics and segments are in the same paragraph.
                        var paraGuid = GetGuid(firstXficInnerAnnElement.Element("BeginObject").Element("objsur"));
                        var beginOffsetElement = firstXficInnerAnnElement.Element("BeginOffset");
                        // Stored offsets are machine-written, so parse them culture-invariantly.
                        var firstXficBeginOffset = beginOffsetElement == null
                            ? 0
                            : int.Parse(beginOffsetElement.Attribute("val").Value,
                                System.Globalization.CultureInfo.InvariantCulture);
                        var newSegmentGuid = kChangeMe;
                        try
                        {
                            // Every segment whose key is at or before the xfic overwrites the guid,
                            // so the last such segment wins.
                            foreach (var kvp in paraToOldSegments[paraGuid].TakeWhile(seg => seg.Key <= firstXficBeginOffset))
                            {
                                newSegmentGuid = ccaGuidMap[new Guid(GetGuid(kvp.Value))].ToString().ToLowerInvariant();
                            }
                        }
                        catch (KeyNotFoundException)
                        {
                            // Upon finding an orphaned chart cell with an invalid paragraph, skip it
                            // (moves on to the next old CCA guid).
                            continue;
                        }
                        if (newSegmentGuid == kChangeMe)
                        {
                            // We might have some segments (better check), but there are xfics that aren't
                            // covered by a segment, so try to recover the broken data, as much as possible.
                            newSegmentGuid = AddExtraInitialSegment(paraGuid, ccaGuidMap, paraToOldSegments);
                        }
                        halfBakedCcwgItems.Add(new KeyValuePair<byte[], XElement>(oldAnnElement, newCCAElement));

                        newClassName = "ConstChartWordGroup";
                        newSpecificClassElement.Name = newClassName;
                        newClassAttr.Value = newClassName;
                        newSpecificClassElement.Add(new XElement("BeginSegment",
                            DataMigrationServices.CreateReferenceObjSurElement(newSegmentGuid)));
                        newSpecificClassElement.Add(new XElement("EndSegment",
                            DataMigrationServices.CreateReferenceObjSurElement(newSegmentGuid)));
                        // For now, just store the guid of the first xfic.
                        // It's the wrong data type, but xml won't care,
                        // and, they will get changed later on.
                        newSpecificClassElement.Add(new XElement("BeginAnalysisIndex",
                            new XAttribute("val", firstXficGuid)));
                        // For now, just store the guid of the last xfic.
                        // It's the wrong data type, but xml won't care,
                        // and, they will get changed later on.
                        newSpecificClassElement.Add(new XElement("EndAnalysisIndex",
                            new XAttribute("val", lastXficGuid)));
                        break;
                    }
                }
                break;
            }
        }

        // Create new owning objSur Element.
        retval.Add(DataMigrationServices.CreateOwningObjSurElement(newGuid));

        // Add newly converted CCA to repos.
        dtoRepos.Add(new DomainObjectDTO(newGuid, newClassName, newCCAElement.ToString()));
    }

    return retval;
}
/// <summary>
/// Gives a pfic annotation an InstanceOf reference to a PunctuationForm, creating and registering
/// the PunctuationForm when none with the same writing system and text has been made yet.
/// </summary>
/// <param name="dtoRepos">Repository used to fetch the paragraph and to add a new PunctuationForm.</param>
/// <param name="dtoPfic">The pfic annotation; its XmlBytes are replaced on success.</param>
/// <param name="newPunctForms">PunctuationForm DTOs created so far, keyed by "ws-form".</param>
/// <returns>
/// True when the InstanceOf element was inserted. False when the offsets or paragraph text are
/// unusable; in that case the paragraph has been passed to MarkParaAsNeedingTokenization.
/// </returns>
private static bool EnsurePficHasInstanceOf(
    IDomainObjectDTORepository dtoRepos,
    DomainObjectDTO dtoPfic,
    IDictionary<string, DomainObjectDTO> newPunctForms)
{
    var pficXml = dtoPfic.XmlBytes;

    // The paragraph is named by the BeginObject property of the pfic. Its text lives in
    // <Contents><Str><Run ws="...">text</Run>...</Str></Contents>.
    var owningParaDto = dtoRepos.GetDTO(GetBeginObjectGuid(pficXml));
    var ichBegin = GetBeginOffset(pficXml);
    var ichEnd = GetEndOffset(pficXml);
    if (ichBegin > ichEnd || ichBegin < 0)
    {
        // Bad begin or end offset.
        MarkParaAsNeedingTokenization(dtoRepos, owningParaDto);
        return false;
    }

    // Join the text of all runs and normalize the result to NFD.
    List<string> runWritingSystems;
    var runTexts = GetParagraphContentRuns(owningParaDto.XmlBytes, out runWritingSystems);
    Debug.Assert(runTexts.Count == runWritingSystems.Count);
    var joined = new StringBuilder();
    foreach (var text in runTexts)
        joined.Append(text);
    var paraText = Icu.Normalize(joined.ToString(), Icu.UNormalizationMode.UNORM_NFD);
    if (ichEnd > paraText.Length)
    {
        // The end offset lies beyond the end of the paragraph text.
        MarkParaAsNeedingTokenization(dtoRepos, owningParaDto);
        return false;
    }

    // The punctuation form is the text between the two offsets.
    var newForm = paraText.Substring(ichBegin, ichEnd - ichBegin);
    if (newForm.Length == 0)
    {
        // No punctuation form at all.
        MarkParaAsNeedingTokenization(dtoRepos, owningParaDto);
        return false;
    }

    // Locate the run that contains the begin offset and take its writing system.
    var icuLocale = String.Empty;
    var consumedLength = 0;
    var runIndex = 0;
    while (runIndex < runTexts.Count)
    {
        consumedLength += Icu.Normalize(runTexts[runIndex], Icu.UNormalizationMode.UNORM_NFD).Length;
        if (ichBegin < consumedLength)
        {
            if (ichEnd > consumedLength)
            {
                // It starts in one run and ends in another, so bail out.
                MarkParaAsNeedingTokenization(dtoRepos, owningParaDto);
                return false;
            }
            // Entirely inside this run.
            icuLocale = runWritingSystems[runIndex];
            break;
        }
        ++runIndex;
    }
    if (icuLocale == String.Empty)
    {
        // No run matched; something is very wrong with the data.
        MarkParaAsNeedingTokenization(dtoRepos, owningParaDto);
        return false;
    }

    // Reuse the PunctuationForm already made for this writing system and text, or make one.
    var lookupKey = String.Concat(icuLocale, "-", newForm);
    DomainObjectDTO punctFormDto;
    if (!newPunctForms.TryGetValue(lookupKey, out punctFormDto))
    {
        const string className = "PunctuationForm";
        var punctFormGuid = Guid.NewGuid().ToString().ToLower();
        var runElement = new XElement("Run", new XAttribute("ws", icuLocale), newForm);
        var punctFormElement = new XElement("rt",
            new XAttribute("class", className),
            new XAttribute("guid", punctFormGuid),
            new XElement("CmObject"),
            new XElement("PunctuationForm",
                new XElement("Form",
                    new XElement("Str", runElement))));
        punctFormDto = new DomainObjectDTO(punctFormGuid, className, punctFormElement.ToString());
        // Register the new object with the repository and with the lookup table.
        dtoRepos.Add(punctFormDto);
        newPunctForms.Add(lookupKey, punctFormDto);
    }

    // Insert the InstanceOf element at the end-tag offset of the CmAnnotation element.
    // NB: No need to mess with registering it as modified,
    // since it gets deleted anyway later on.
    var annotationBounds = new ElementBounds(pficXml, s_tagsCmAnnotation);
    Debug.Assert(annotationBounds.IsValid);
    var instanceOfXml = String.Format("<InstanceOf>{1}{0}{1}</InstanceOf>{1}",
        DataMigrationServices.CreateReferenceObjSurElement(punctFormDto.Guid),
        Environment.NewLine);
    var updatedBytes = new List<byte>(pficXml);
    updatedBytes.InsertRange(annotationBounds.EndTagOffset, Encoding.UTF8.GetBytes(instanceOfXml));
    dtoPfic.XmlBytes = updatedBytes.ToArray();
    return true;
}
/// <summary>
/// Migrates constituent chart data: re-points each DsConstChart's Rows to the new row guids as
/// owning references, then converts every old chart row (CCR) annotation into a ConstChartRow
/// object that is added to the repository.
/// </summary>
/// <param name="dtoRepos">Repository that is read from and that receives the new and updated objects.</param>
/// <param name="paraToOldSegments">Old segments per paragraph guid; passed through to AddCells.</param>
/// <param name="oldCCRs">Bytes of the old chart row annotations.</param>
/// <param name="oldCCAs">Old chart cell annotations by old guid; passed through to AddCells.</param>
/// <param name="ccaGuidMap">Key is the old guid. Value is the new guid.</param>
/// <returns>
/// The (old annotation bytes, new element) pairs collected while adding cells, to be finished
/// once segments and xfics have been converted.
/// </returns>
private static List<KeyValuePair<byte[], XElement>> ProcessDiscourseData(IDomainObjectDTORepository dtoRepos,
    IDictionary<string, SortedList<int, byte[]>> paraToOldSegments,
    IEnumerable<byte[]> oldCCRs,
    IDictionary<string, byte[]> oldCCAs,
    IDictionary<Guid, Guid> ccaGuidMap)
{
    // Pairs of old CCA annotation and new ConstChartWordGroup element for the annotations whose
    // AppliesTo held twfics. The twfic conversion needs them to work out the Segment and the
    // begin/end indices.
    var unfinishedWordGroups = new List<KeyValuePair<byte[], XElement>>();

    // Key is the new CCR guid. Value is the guid of its new owning chart.
    var chartByNewRowGuid = new Dictionary<Guid, Guid>();

    // Migrate the DsConstChart(s).
    foreach (var chartDto in dtoRepos.AllInstancesSansSubclasses("DsConstChart"))
    {
        var chartXml = XElement.Parse(chartDto.Xml);
        var rowRefs = chartXml.Element("DsConstChart").Elements("Rows").Elements("objsur");
        foreach (var rowRef in rowRefs)
        {
            // The chart now owns its rows.
            rowRef.Attribute("t").Value = "o";

            // Swap the old row guid for the new one, remembering which chart owns the new row.
            var rowGuidAttr = rowRef.Attribute("guid");
            var newRowGuid = ccaGuidMap[new Guid(rowGuidAttr.Value)];
            chartByNewRowGuid.Add(newRowGuid, new Guid(chartDto.Guid));
            rowGuidAttr.Value = newRowGuid.ToString().ToLowerInvariant();
        }

        // Tell dto repos of the modification of the chart.
        chartDto.Xml = chartXml.ToString();
        dtoRepos.Update(chartDto);
    }

    // Builds <name val="..."/> elements for the simple row properties.
    Func<string, string, XElement> makeValElement =
        (name, val) => new XElement(name, new XAttribute("val", val));

    // Migrate the CCR and CCA annotations.
    foreach (var oldCCR in oldCCRs)
    {
        // Collect up the inner class-level elements.
        var annotationBounds = new ElementBounds(oldCCR, s_tagsCmAnnotation);
        // CompDetails and Text are optional, so these bounds may be invalid.
        var compDetailsBounds = new ElementBounds(oldCCR, s_tagsCompDetails,
            annotationBounds.BeginTagOffset, annotationBounds.EndTagOffset);
        var textBounds = new ElementBounds(oldCCR, s_tagsText,
            annotationBounds.BeginTagOffset, annotationBounds.EndTagOffset);
        // The old Comment bounds are not collected here; the English alternative is fetched below.

        // May be null, or at least have no 'objsur' elements.
        var refsFromAppliesTo = GetAppliesToObjsurGuids(oldCCR);

        // Try to make a Notes element from the first paragraph of the row's StText. It may stay null.
        XElement notesElement = null;
        if (textBounds.IsValid)
        {
            var stTextGuid = GetObjsurGuid(oldCCR, textBounds.BeginTagOffset, textBounds.EndTagOffset);
            if (!String.IsNullOrEmpty(stTextGuid))
            {
                var stTextBytes = dtoRepos.GetDTO(stTextGuid).XmlBytes;
                var stTextBounds = new ElementBounds(stTextBytes, s_tagsStText);
                var paragraphsBounds = new ElementBounds(stTextBytes, s_tagsParagraphs,
                    stTextBounds.BeginTagOffset, stTextBounds.EndTagOffset);
                // See if the StText has any paras (StTxtPara).
                if (paragraphsBounds.IsValid)
                {
                    // Get the first para.
                    var firstParaGuid = GetObjsurGuid(stTextBytes,
                        paragraphsBounds.BeginTagOffset, paragraphsBounds.EndTagOffset);
                    if (!String.IsNullOrEmpty(firstParaGuid))
                    {
                        var firstParaBytes = dtoRepos.GetDTO(firstParaGuid).XmlBytes;
                        var paraBounds = new ElementBounds(firstParaBytes, s_tagsStTxtPara);
                        var contentsBounds = new ElementBounds(firstParaBytes, s_tagsContents,
                            paraBounds.BeginTagOffset, paraBounds.EndTagOffset);
                        var strBounds = new ElementBounds(firstParaBytes, s_tagsStr,
                            contentsBounds.BeginTagOffset, contentsBounds.EndTagOffset);
                        // See if it has any Contents.
                        if (strBounds.IsValid)
                        {
                            // Move the Contents into a new Notes element.
                            var strXml = Encoding.UTF8.GetString(firstParaBytes,
                                strBounds.BeginTagOffset, strBounds.Length);
                            notesElement = new XElement("Notes", XElement.Parse(strXml));
                        }
                    }
                }
            }
        }

        // Defaults for the row properties; the optional details below may override them.
        var clauseTypeElement = makeValElement("ClauseType", "0");
        var endParagraphElement = makeValElement("EndParagraph", "False");
        var endSentenceElement = makeValElement("EndSentence", "False");
        var startDependentClauseGroupElement = makeValElement("StartDependentClauseGroup", "False");
        var endDependentClauseGroupElement = makeValElement("EndDependentClauseGroup", "False");

        // See if some optional stuff in the old CompDetails will change them.
        var uniBounds = new ElementBounds(oldCCR, s_tagsUni,
            compDetailsBounds.BeginTagOffset, compDetailsBounds.EndTagOffset);
        if (uniBounds.IsValid)
        {
            // Turn its pseudo-xml string content into a real XElement.
            // Its string won't have angle brackets, so turn the entities into '<' and '>'
            // <CompDetails><Uni><ccinfo endSent="true"/></Uni></CompDetails>
            var ichMin = uniBounds.BeginTagOffset + s_tagsUni.BeginTag.Length;
            var details = Encoding.UTF8.GetString(oldCCR, ichMin, uniBounds.EndTagOffset - ichMin);
            if (details.Contains("&"))
                details = XmlUtils.DecodeXmlAttribute(details);
            var compDetailsElement = XElement.Parse(details);

            // True when the named attribute exists and reads "true" in any letter case.
            Func<string, bool> isTrue = attrName =>
            {
                var attr = compDetailsElement.Attribute(attrName);
                return attr != null && attr.Value.ToLower() == "true";
            };

            // ClauseType: the first of dependent/song/speech that is set wins.
            if (isTrue("dependent"))
                clauseTypeElement.SetAttributeValue("val", "1");
            else if (isTrue("song"))
                clauseTypeElement.SetAttributeValue("val", "2");
            else if (isTrue("speech"))
                clauseTypeElement.SetAttributeValue("val", "3");

            // The other four optional boolean attrs are independent of each other.
            if (isTrue("endSent"))
                endSentenceElement.SetAttributeValue("val", "True");
            if (isTrue("endPara"))
                endParagraphElement.SetAttributeValue("val", "True");
            if (isTrue("firstDep"))
                startDependentClauseGroupElement.SetAttributeValue("val", "True");
            if (isTrue("endDep"))
                endDependentClauseGroupElement.SetAttributeValue("val", "True");
        }

        // Required 'Label' prop, which comes from the English alternative of the old 'Comment'.
        var enAltBounds = GetEnglishCommentBounds(oldCCR,
            annotationBounds.BeginTagOffset, annotationBounds.EndTagOffset);
        var enAltElement = XElement.Parse(
            Encoding.UTF8.GetString(oldCCR, enAltBounds.BeginTagOffset, enAltBounds.Length));
        // Convert it to "Str" element with no "ws" attr.
        enAltElement.Name = "Str";
        enAltElement.Attribute("ws").Remove();

        // Create new ConstChartRow class (XElement & DTO).
        const string className = "ConstChartRow";
        var newGuid = ccaGuidMap[new Guid(GetGuid(oldCCR))];
        var owningGuid = chartByNewRowGuid[newGuid];
        chartByNewRowGuid.Remove(newGuid);
        var newGuidString = newGuid.ToString().ToLowerInvariant();
        var cells = AddCells(dtoRepos, paraToOldSegments, unfinishedWordGroups, refsFromAppliesTo,
            oldCCAs, ccaGuidMap, newGuidString);
        var newCCRElement = new XElement("rt",
            new XAttribute("class", className),
            new XAttribute("guid", newGuid),
            new XAttribute("ownerguid", owningGuid),
            new XElement("CmObject"),
            new XElement(className,
                notesElement, // May be null.
                clauseTypeElement,
                endParagraphElement,
                endSentenceElement,
                startDependentClauseGroupElement,
                endDependentClauseGroupElement,
                new XElement("Label", enAltElement),
                cells));

        // Add DTO to repos.
        dtoRepos.Add(new DomainObjectDTO(newGuid.ToString().ToLowerInvariant(), className,
            newCCRElement.ToString()));
    }

    return unfinishedWordGroups;
}
/// <summary>
/// Blanks out, in place, every objsur element in the AppliesTo element of a CmIndirectAnnotation
/// whose guid is not contained in <paramref name="appliesToGuids"/>.
/// </summary>
/// <param name="xmlBytes">Annotation bytes; removed references are overwritten with spaces.</param>
/// <param name="appliesToGuids">Guids to keep; the lookup uses the lower-cased (invariant) guid.</param>
private static void RemoveMismatchedAppliesToRefs(byte[] xmlBytes, ICollection<string> appliesToGuids)
{
    const byte space = 0x20;

    var indirectBounds = new ElementBounds(xmlBytes, s_tagsCmIndirect);
    if (!indirectBounds.IsValid)
        return;
    var appliesToBounds = new ElementBounds(xmlBytes, s_tagsAppliesTo,
        indirectBounds.BeginTagOffset, indirectBounds.EndTagOffset);
    if (!appliesToBounds.IsValid)
        return;

    var cursor = new ElementBounds(xmlBytes, s_tagsObjsur,
        appliesToBounds.BeginTagOffset, appliesToBounds.EndTagOffset);
    while (cursor.IsValid)
    {
        // Capture what is needed from the current element, then advance to the next one,
        // because the next element's start marks where the current one's span ends.
        var eraseFrom = cursor.BeginTagOffset;
        var refGuid = cursor.GetAttributeValue(s_guidAttr);
        cursor.Reset(cursor.EndTagOffset, appliesToBounds.EndTagOffset);

        if (String.IsNullOrEmpty(refGuid))
            continue;
        if (appliesToGuids.Contains(refGuid.ToLowerInvariant()))
            continue;

        // With no following objsur, erase up to the AppliesTo end-tag offset instead.
        var eraseTo = cursor.BeginTagOffset;
        if (eraseTo < 0)
            eraseTo = appliesToBounds.EndTagOffset;

        // Remove the <objsur> element by overwriting it with spaces.
        for (var ich = eraseFrom; ich < eraseTo; ++ich)
            xmlBytes[ich] = space;
    }
}
/// <summary>
/// Gets the guid of the objsur found in the AnnotationType element of the CmAnnotation element.
/// </summary>
/// <param name="xmlBytes">Bytes of the annotation to inspect.</param>
/// <returns>
/// The result of GetObjsurGuid for the AnnotationType element, or null when the CmAnnotation
/// or AnnotationType element is not found.
/// </returns>
private static string GetAnnotationTypeGuid(byte[] xmlBytes)
{
    var annotation = new ElementBounds(xmlBytes, s_tagsCmAnnotation);
    if (annotation.IsValid)
    {
        var annotationType = new ElementBounds(xmlBytes, s_tagsAnnotationType,
            annotation.BeginTagOffset, annotation.EndTagOffset);
        if (annotationType.IsValid)
            return GetObjsurGuid(xmlBytes, annotationType.BeginTagOffset, annotationType.EndTagOffset);
    }
    return null;
}
/// <summary>
/// Extracts the text and the writing system of every Run in the Contents/Str element of an
/// StTxtPara.
/// </summary>
/// <param name="xmlStTxtPara">Bytes of the paragraph.</param>
/// <param name="writingSystems">
/// Receives the "ws" attribute value of each run, parallel to the returned list.
/// </param>
/// <returns>
/// The text of each run after XmlUtils.DecodeXmlAttribute; empty when the paragraph has no
/// Str element.
/// </returns>
private static List<string> GetParagraphContentRuns(byte[] xmlStTxtPara, out List<string> writingSystems)
{
    var retval = new List<string>();
    writingSystems = new List<string>();

    var stTxtParaBounds = new ElementBounds(xmlStTxtPara, s_tagsStTxtPara);
    var contentsBounds = new ElementBounds(xmlStTxtPara, s_tagsContents,
        stTxtParaBounds.BeginTagOffset, stTxtParaBounds.EndTagOffset);
    var strBounds = new ElementBounds(xmlStTxtPara, s_tagsStr,
        contentsBounds.BeginTagOffset, contentsBounds.EndTagOffset);
    if (!strBounds.IsValid)
        return retval;

    // Runs belong to the Str element, so every search (the first one and each follow-up)
    // is limited to the Str element's end rather than the enclosing Contents element's end.
    var ichStrEnd = strBounds.EndTagOffset;
    var runBounds = new ElementBounds(xmlStTxtPara, s_tagsRun, strBounds.BeginTagOffset, ichStrEnd);
    while (runBounds.IsValid)
    {
        writingSystems.Add(runBounds.GetAttributeValue(s_wsAttr));

        var ichText = runBounds.EndOfStartTag + 1; // move past the >
        var runText = Encoding.UTF8.GetString(xmlStTxtPara, ichText, runBounds.EndTagOffset - ichText);
        retval.Add(XmlUtils.DecodeXmlAttribute(runText));

        runBounds.Reset(runBounds.EndTagOffset, ichStrEnd);
    }
    return retval;
}
/// <summary>
/// For every StTxtPara that has a Contents element: marks it as needing tokenization, converts
/// its old segment annotations into new Segment objects (added to the repository), and inserts
/// a Segments element with the owning references into the paragraph.
/// </summary>
/// <param name="dtoRepos">Repository that is read from and that receives new and updated objects.</param>
/// <param name="oldCCAs">Not used by this method.</param>
/// <param name="halfBakedCcwgItems">(Old annotation bytes, new element) pairs still to be finished; those whose first AppliesTo target begins in the current paragraph are handed to AddSegmentAnalyses.</param>
/// <param name="paraToOldSegments">Old segments per lower-cased paragraph guid; the processed paragraph's entry is removed.</param>
/// <param name="paraToOldXfics">Old xfics per lower-cased paragraph guid; the processed paragraph's entry is removed.</param>
/// <param name="ccaGuidMap">Maps old guids to new guids; a placeholder segment maps to itself.</param>
/// <param name="oldTextTags">Passed through to AddSegmentAnalyses.</param>
/// <param name="freeTrans">Passed through to AddFreeTranslation.</param>
/// <param name="litTrans">Passed through to AddLiteralTranslation.</param>
/// <param name="notes">Passed through to AddNotes.</param>
private static void ProcessParagraphs(
    IDomainObjectDTORepository dtoRepos,
    IDictionary<string, byte[]> oldCCAs,
    IEnumerable<KeyValuePair<byte[], XElement>> halfBakedCcwgItems,
    IDictionary<string, SortedList<int, byte[]>> paraToOldSegments,
    IDictionary<string, SortedList<int, byte[]>> paraToOldXfics,
    IDictionary<Guid, Guid> ccaGuidMap,
    ICollection<byte[]> oldTextTags,
    Dictionary<string, List<byte[]>> freeTrans,
    Dictionary<string, List<byte[]>> litTrans,
    Dictionary<string, List<byte[]>> notes)
{
    var dtos = dtoRepos.AllInstancesWithSubclasses("StTxtPara");
    foreach (var currentParaDto in dtos)
    {
        // If it has no contents, then skip it.
        var stTxtParaBounds = new ElementBounds(currentParaDto.XmlBytes, s_tagsStTxtPara);
        if (!stTxtParaBounds.IsValid)
            continue;
        var contentsBounds = new ElementBounds(currentParaDto.XmlBytes, s_tagsContents,
            stTxtParaBounds.BeginTagOffset, stTxtParaBounds.EndTagOffset);
        if (!contentsBounds.IsValid)
            continue;

        // Mark the paragraph as needing retokenization.
        MarkParaAsNeedingTokenization(dtoRepos, currentParaDto);

        // Guid strings hold only hex digits and dashes, so invariant lower-casing is used
        // (as it is for guids elsewhere in this migration) and the result is reused below.
        var currentParaGuid = currentParaDto.Guid.ToLowerInvariant();
        SortedList<int, byte[]> xficsForCurrentPara;
        paraToOldXfics.TryGetValue(currentParaGuid, out xficsForCurrentPara);

        SortedList<int, byte[]> segsForCurrentPara;
        if (!paraToOldSegments.TryGetValue(currentParaGuid, out segsForCurrentPara)
            && xficsForCurrentPara != null
            && xficsForCurrentPara.Count > 0)
        {
            // We have no segments at all, but there are xfics, so try to recover the broken data,
            // as much as possible.
            // Need to create a new old segment (raw xml, not a dto), to try and keep old data.
            var guidBrandNewSeg = Guid.NewGuid();
            var brandNewSegGuid = guidBrandNewSeg.ToString().ToLowerInvariant();
            ccaGuidMap.Add(guidBrandNewSeg, guidBrandNewSeg);
            segsForCurrentPara = new SortedList<int, byte[]>();
            paraToOldSegments.Add(currentParaGuid, segsForCurrentPara);

            var bldr = new StringBuilder();
            // The start tag must be closed with '>' so the placeholder is well-formed XML.
            bldr.AppendFormat("<rt guid=\"{0}\">", brandNewSegGuid);
            bldr.Append("<CmObject/>");
            bldr.Append("<CmBaseAnnotation>");
            bldr.Append("<BeginOffset val=\"0\"/>");
            bldr.AppendFormat("<EndOffset val=\"{0}\"/>", int.MaxValue);
            bldr.Append("</CmBaseAnnotation>");
            bldr.Append("</rt>");
            segsForCurrentPara.Add(0, Encoding.UTF8.GetBytes(bldr.ToString()));
        }

        // If the para has no segs or xfics, skip the following work.
        if (segsForCurrentPara == null)
            continue;

        if (xficsForCurrentPara != null && xficsForCurrentPara.Count > 0 && segsForCurrentPara.Count > 0)
        {
            // We have both segments and xfics. Check for odd case (like FWR-3081)
            // where the first segment starts AFTER the first xfic, and add a new
            // segment that covers the text up to the first current segment.
            if (xficsForCurrentPara.First().Key < segsForCurrentPara.First().Key)
                AddExtraInitialSegment(currentParaGuid, ccaGuidMap, paraToOldSegments);
        }

        var halfBakedCcwgItemsForCurrentPara = new List<KeyValuePair<byte[], XElement>>();
        List<string> writingSystems;
        var runs = GetParagraphContentRuns(currentParaDto.XmlBytes, out writingSystems);
        // We may well have segments with no xfics, for example, Scripture that has segmented BT.
        if (xficsForCurrentPara != null)
        {
            // Since pfics/wfics were 'optional' and usually not maintained in the db,
            // we need to make sure there is a dummy one in xficsForCurrentPara
            // in order to get the correct Begin/EndAnalysisIndex for chart and tagging objects
            // It turns out we don't need to worry about ws and exact begin/end character offsets.
            // All we need to end up with correct indices is the correct NUMBER of xfics.
            var context = new ParagraphContext(currentParaGuid, xficsForCurrentPara);
            EnsureAllXfics(runs, context);

            // Find any 'halfbaked' items for the current paragraph.
            // Get the para for the first objsur's guid (some twfic ann),
            // in the CmIndirectAnnotation's AppliesTo prop.
            foreach (var kvp in halfBakedCcwgItems)
            {
                var refs = GetAppliesToObjsurGuids(kvp.Key);
                if (refs == null || refs.Count == 0)
                    continue;
                var guid = refs[0];
                var dto = dtoRepos.GetDTO(guid);
                var guidBegin = GetBeginObjectGuid(dto.XmlBytes);
                if (guidBegin == currentParaGuid)
                    halfBakedCcwgItemsForCurrentPara.Add(kvp);
            }
        }

        var bldrSegmentsElement = new StringBuilder();
        var numberOfOldSegmentsInCurrentPara = segsForCurrentPara.Values.Count;
        var currentOldSegmentIdx = 1;
        foreach (var currentOldSegInCurrentPara in segsForCurrentPara.Values)
        {
            var isLastOldSegment = (currentOldSegmentIdx++ == numberOfOldSegmentsInCurrentPara);
            var oldSegGuid = GetGuid(currentOldSegInCurrentPara);
            var guidOldSeg = new Guid(oldSegGuid);
            var newSegGuid = ccaGuidMap[guidOldSeg].ToString().ToLowerInvariant();

            // Add it to the Segments prop of the current paragraph.
            var objsur = DataMigrationServices.CreateOwningObjSurElement(newSegGuid);
            bldrSegmentsElement.AppendLine(objsur.ToString());

            // Create new XElement for new segment.
            var newSegmentElement = new XElement("rt",
                new XAttribute("class", "Segment"),
                new XAttribute("guid", newSegGuid),
                new XAttribute("ownerguid", currentParaGuid),
                new XElement("CmObject"),
                new XElement("Segment",
                    AddBeginOffset(GetBeginOffset(currentOldSegInCurrentPara)),
                    AddFreeTranslation(oldSegGuid, freeTrans),
                    AddLiteralTranslation(oldSegGuid, litTrans),
                    AddNotes(dtoRepos, newSegGuid, oldSegGuid, notes),
                    AddSegmentAnalyses(dtoRepos,
                        halfBakedCcwgItemsForCurrentPara,
                        currentOldSegInCurrentPara,
                        xficsForCurrentPara,
                        oldTextTags,
                        newSegGuid,
                        isLastOldSegment,
                        currentParaDto)));
            newSegmentElement = DeleteTemporaryAnalyses(newSegmentElement);

            // Create a new Segment instance DTO from the 'newSegmentElement',
            // and add it to repos.
            var newSegDto = new DomainObjectDTO(newSegGuid, "Segment", newSegmentElement.ToString());
            dtoRepos.Add(newSegDto);
        }

        paraToOldSegments.Remove(currentParaGuid);
        paraToOldXfics.Remove(currentParaGuid);

        if (bldrSegmentsElement.Length == 0)
            continue;
        bldrSegmentsElement.Insert(0, "<Segments>");
        bldrSegmentsElement.Append("</Segments>");

        // Add the Segments element to the current para. The bounds are recomputed from the
        // DTO's current XmlBytes rather than reusing the ones found at the top of the loop.
        var segBytes = Encoding.UTF8.GetBytes(bldrSegmentsElement.ToString());
        var xmlNew = new List<byte>(currentParaDto.XmlBytes.Length + segBytes.Length);
        xmlNew.AddRange(currentParaDto.XmlBytes);
        stTxtParaBounds = new ElementBounds(currentParaDto.XmlBytes, s_tagsStTxtPara);
        xmlNew.InsertRange(stTxtParaBounds.EndTagOffset, segBytes);

        // Tell DTO repos about the modification.
        DataMigrationServices.UpdateDTO(dtoRepos, currentParaDto, xmlNew.ToArray());
    }
}
/// <summary>
/// Look inside the element located with s_tagsCmAnnotation for the element located with
/// s_tagsInstanceOf, and return the guid of the objsur found in it.
/// </summary>
/// <param name="element">The xml bytes to search.</param>
/// <returns>
/// The result of GetObjsurGuid over the InstanceOf range, or null when either the
/// annotation element or the InstanceOf element cannot be located.
/// </returns>
private static string GetInstanceOfGuid(byte[] element)
{
    var annotation = new ElementBounds(element, s_tagsCmAnnotation);
    if (annotation.IsValid)
    {
        // Only search for InstanceOf within the limits of the annotation element.
        var instanceOf = new ElementBounds(element, s_tagsInstanceOf,
            annotation.BeginTagOffset, annotation.EndTagOffset);
        if (instanceOf.IsValid)
        {
            return GetObjsurGuid(element, instanceOf.BeginTagOffset, instanceOf.EndTagOffset);
        }
    }

    return null;
}
/// <summary>
/// Drill down through three nested elements (s_tagsCmBaseAnnotation, then
/// s_tagsBeginObject, then s_tagsObjsur) and return the guid read from the innermost one.
/// </summary>
/// <param name="xmlBytes">The xml bytes to search.</param>
/// <returns>
/// The result of GetGuid over the objsur range, or null as soon as any of the three
/// nested elements cannot be located.
/// </returns>
private static string GetBeginObjectGuid(byte[] xmlBytes)
{
    var annotation = new ElementBounds(xmlBytes, s_tagsCmBaseAnnotation);
    if (annotation.IsValid)
    {
        var beginObject = new ElementBounds(xmlBytes, s_tagsBeginObject,
            annotation.BeginTagOffset, annotation.EndTagOffset);
        if (beginObject.IsValid)
        {
            var objsur = new ElementBounds(xmlBytes, s_tagsObjsur,
                beginObject.BeginTagOffset, beginObject.EndTagOffset);
            if (objsur.IsValid)
            {
                return GetGuid(xmlBytes, objsur.BeginTagOffset, objsur.EndTagOffset);
            }
        }
    }

    return null;
}
/// <summary>
/// Make sure the paragraph's xml says its parse is not current, by writing
/// &lt;ParseIsCurrent val="False"/&gt; into it.
/// </summary>
/// <remarks>
/// - If a ParseIsCurrent element exists and its val is "true" (any casing), that element
///   is replaced in place.
/// - If a ParseIsCurrent element exists with any other val (or no val attribute at all),
///   nothing is changed.
/// - If no ParseIsCurrent element exists, the new element is inserted at the
///   EndTagOffset of the StTxtPara element.
/// </remarks>
/// <param name="dtoRepos">Repository that is told about the modified paragraph.</param>
/// <param name="paraDto">The paragraph DTO whose xml is examined and possibly rewritten.</param>
private static void MarkParaAsNeedingTokenization(IDomainObjectDTORepository dtoRepos, DomainObjectDTO paraDto)
{
    var stTxtParaBounds = new ElementBounds(paraDto.XmlBytes, s_tagsStTxtPara);
    if (!stTxtParaBounds.IsValid)
        return; // should never happen!

    var parseIsCurrentBounds = new ElementBounds(paraDto.XmlBytes, s_tagsParseIsCurrent,
        stTxtParaBounds.BeginTagOffset, stTxtParaBounds.EndTagOffset);
    var replacement = Encoding.UTF8.GetBytes("<ParseIsCurrent val=\"False\"/>");

    List<byte> newXml;
    int ichInsert;
    if (parseIsCurrentBounds.IsValid)
    {
        var val = parseIsCurrentBounds.GetAttributeValue(s_valAttr);
        // A missing val attribute is treated like any non-"true" value: there is nothing
        // to flip, so leave the paragraph alone instead of dereferencing null.
        if (val == null || val.ToLowerInvariant() != "true")
            return;

        // Swap the existing element for the replacement at the same position.
        newXml = new List<byte>(paraDto.XmlBytes.Length + replacement.Length);
        newXml.AddRange(paraDto.XmlBytes);
        newXml.RemoveRange(parseIsCurrentBounds.BeginTagOffset, parseIsCurrentBounds.Length);
        ichInsert = parseIsCurrentBounds.BeginTagOffset;
    }
    else
    {
        newXml = new List<byte>(paraDto.XmlBytes.Length + replacement.Length);
        newXml.AddRange(paraDto.XmlBytes);
        ichInsert = stTxtParaBounds.EndTagOffset;
    }
    newXml.InsertRange(ichInsert, replacement);

    // Tell DTO repos about the modification to the para.
    DataMigrationServices.UpdateDTO(dtoRepos, paraDto, newXml.ToArray());
}
/// <summary>
/// Find every ConstChartWordGroup (subclasses excluded) whose begin or end analysis index
/// value does not parse as an integer, and hand those objects to DeleteUnneededGoners.
/// </summary>
/// <remarks>
/// Objects that lack either index element are skipped, not deleted.
/// </remarks>
/// <param name="dtoRepos">Repository that is searched and passed on to DeleteUnneededGoners.</param>
private static void DeleteCcwgWithGuidIndex(IDomainObjectDTORepository dtoRepos)
{
    var goners = new List<DomainObjectDTO>();
    var ccwgs = dtoRepos.AllInstancesSansSubclasses("ConstChartWordGroup");
    foreach (var dto in ccwgs)
    {
        var ebBegIndex = new ElementBounds(dto.XmlBytes, s_tagsBegAnalysisIndex);
        var ebEndIndex = new ElementBounds(dto.XmlBytes, s_tagsEndAnalysisIndex);
        if (!(ebBegIndex.IsValid && ebEndIndex.IsValid))
            continue; // Hopefully this isn't a problem!

        var sBegAnalysisIndexValue = ebBegIndex.GetAttributeValue(s_valAttr);
        var sEndAnalysisIndexValue = ebEndIndex.GetAttributeValue(s_valAttr);
        int dummy; // Only the success/failure of the parse matters.
        if (!Int32.TryParse(sBegAnalysisIndexValue, out dummy) ||
            !Int32.TryParse(sEndAnalysisIndexValue, out dummy))
        {
            // Found something that needs fixing!
            // There's likely a guid instead of an integer, due to an unresolved
            // reference where the wfic it used to refer to has been deleted
            // for some (valid) reason.
            goners.Add(dto);
        }
    }

    // Need to remove goners from dtoRepos and make sure that empty rows get deleted too.
    if (goners.Count > 0)
    {
        var neededGoners = new Set<string>();
        DeleteUnneededGoners(dtoRepos, goners, neededGoners);
        goners.Clear();
    }
}
/// <summary>
/// Collect the guids of all objsur elements found inside the AppliesTo element of the
/// element located with s_tagsCmIndirect.
/// </summary>
/// <param name="xmlBytes">The xml bytes to search.</param>
/// <returns>
/// The guids, lower-cased, in the order the objsur elements are found. Never null: the
/// list is empty when the enclosing element or the AppliesTo element cannot be located,
/// or when no objsur yields a non-empty guid.
/// </returns>
private static List<string> GetAppliesToObjsurGuids(byte[] xmlBytes)
{
    var retval = new List<string>();
    var indirectBounds = new ElementBounds(xmlBytes, s_tagsCmIndirect);
    if (!indirectBounds.IsValid)
        return retval;

    var appliesToBounds = new ElementBounds(xmlBytes, s_tagsAppliesTo,
        indirectBounds.BeginTagOffset, indirectBounds.EndTagOffset);
    // Without an AppliesTo element there is nothing to collect, and its offsets
    // must not be used to bound the objsur search below.
    if (!appliesToBounds.IsValid)
        return retval;

    var objsurBounds = new ElementBounds(xmlBytes, s_tagsObjsur,
        appliesToBounds.BeginTagOffset, appliesToBounds.EndTagOffset);
    while (objsurBounds.IsValid)
    {
        var guid = GetGuid(xmlBytes, objsurBounds.BeginTagOffset, objsurBounds.EndTagOffset);
        if (!String.IsNullOrEmpty(guid))
            retval.Add(guid.ToLowerInvariant());
        // Continue the search after the objsur just handled, still within AppliesTo.
        objsurBounds.Reset(objsurBounds.EndTagOffset, appliesToBounds.EndTagOffset);
    }
    return retval;
}
/// <summary>
/// Locate an objsur element within the given range of the bytes and return its guid.
/// </summary>
/// <param name="xmlBytes">The xml bytes to search.</param>
/// <param name="ichMin">Start of the range handed to ElementBounds.</param>
/// <param name="ichLim">Limit of the range handed to ElementBounds.</param>
/// <returns>
/// The result of GetGuid over the objsur range, or null when no objsur element is found.
/// </returns>
private static string GetObjsurGuid(byte[] xmlBytes, int ichMin, int ichLim)
{
    var objsur = new ElementBounds(xmlBytes, s_tagsObjsur, ichMin, ichLim);
    return objsur.IsValid
        ? GetGuid(xmlBytes, objsur.BeginTagOffset, objsur.EndTagOffset)
        : null;
}
/// <summary>
/// Build a new element named <paramref name="elementName"/> that holds the AStr elements
/// of the Comment of the first old annotation registered for the given old segment.
/// </summary>
/// <param name="oldSegGuid">Key used to look up the old annotations.</param>
/// <param name="inputPairs">Map from a guid string to the xml bytes of old annotations.</param>
/// <param name="elementName">Name given to the element that is created.</param>
/// <returns>
/// The new element, or null when there is no entry (or an empty one) for the key, or the
/// first annotation has no locatable annotation element or Comment element.
/// </returns>
private static XElement CreateATranslation(string oldSegGuid, Dictionary<string, List<byte[]>> inputPairs, string elementName)
{
    List<byte[]> oldTranslations;
    var nothingToMove = !inputPairs.TryGetValue(oldSegGuid, out oldTranslations)
        || oldTranslations == null
        || oldTranslations.Count == 0;
    if (nothingToMove)
    {
        return null;
    }

    // Only the first old annotation is used.
    var source = oldTranslations[0];

    // Move optional Comment from indirect ann to new element.
    var annotation = new ElementBounds(source, s_tagsCmAnnotation);
    if (!annotation.IsValid)
    {
        return null;
    }

    var comment = new ElementBounds(source, s_tagsComment,
        annotation.BeginTagOffset, annotation.EndTagOffset);
    if (!comment.IsValid)
    {
        return null;
    }

    return new XElement(elementName,
        GetAStrElements(source, comment.BeginTagOffset, comment.EndTagOffset));
}
/// <summary>
/// Decide whether the comment in the given range contains the s_ltlt byte sequence
/// inside its AStr element, and report that as the string "True" or "False".
/// </summary>
/// <remarks>
/// The English AStr (as found by GetEnglishCommentBounds) is preferred. When there is
/// none, the first AStr found in the Comment element is used instead.
/// NOTE(review): this assumes IndexOfSubArray takes a start offset as its second
/// argument and returns a negative value when nothing is found; confirm.
/// </remarks>
/// <param name="xmlBytes">The xml bytes to search.</param>
/// <param name="ichMin">Start of the range to search for the Comment element.</param>
/// <param name="ichLim">Limit of the range to search for the Comment element.</param>
/// <returns>
/// "True" when s_ltlt is found after the first s_endXmlTag of the AStr at a positive
/// offset that is before the AStr's EndTagOffset. "False" in every other case,
/// including when no usable AStr exists.
/// </returns>
private static string GetPreposedBooleanFromComment(byte[] xmlBytes, int ichMin, int ichLim)
{
    var astr = GetEnglishCommentBounds(xmlBytes, ichMin, ichLim);
    var haveEnglish = astr != null && astr.IsValid;
    if (!haveEnglish)
    {
        // If you can't find an English comment, take the first one you can find.
        var comment = new ElementBounds(xmlBytes, s_tagsComment, ichMin, ichLim);
        if (comment.IsValid)
        {
            astr = new ElementBounds(xmlBytes, s_tagsAStr, comment.BeginTagOffset, comment.EndTagOffset);
        }
    }

    if (astr == null || !astr.IsValid)
    {
        return "False";
    }

    // Skip past the end of the AStr begin tag before looking for the marker.
    var ichTagClose = xmlBytes.IndexOfSubArray(s_endXmlTag, astr.BeginTagOffset);
    var ichMarker = xmlBytes.IndexOfSubArray(s_ltlt, ichTagClose);
    var markerInsideAStr = ichMarker > 0 && ichMarker < astr.EndTagOffset;
    return markerInsideAStr ? "True" : "False";
}
/// <summary>
/// Register the annotation DTO for deletion, and remember its xml when it has an AStr
/// element inside its Comment element.
/// </summary>
/// <param name="goners">Collection that always receives <paramref name="annDto"/>.</param>
/// <param name="annElements">
/// Collection that receives <paramref name="annElement"/> only when an AStr is found.
/// </param>
/// <param name="annDto">The DTO of the annotation.</param>
/// <param name="annElement">The xml bytes of the annotation.</param>
private static void PreprocessDiscourseAnnotation(ICollection<DomainObjectDTO> goners, ICollection<byte[]> annElements, DomainObjectDTO annDto, byte[] annElement)
{
    // The old annotation goes away whether or not it has usable content.
    goners.Add(annDto);

    // Narrow the search step by step: annotation, then Comment, then AStr.
    var annotation = new ElementBounds(annElement, s_tagsCmAnnotation);
    var comment = new ElementBounds(annElement, s_tagsComment,
        annotation.BeginTagOffset, annotation.EndTagOffset);
    var astr = new ElementBounds(annElement, s_tagsAStr,
        comment.BeginTagOffset, comment.EndTagOffset);

    if (astr.IsValid)
    {
        annElements.Add(annElement);
    }
}
/// <summary>
/// Constructor for an element that is to be found inside another element which has
/// already been located in the same byte array.
/// </summary>
/// <param name="xmlBytes">The byte array to search.</param>
/// <param name="tags">The tags of the element to look for.</param>
/// <param name="bounds">
/// The enclosing element; its BeginTagOffset and EndTagOffset are passed to Reset.
/// </param>
public ElementBounds(byte[] xmlBytes, ElementTags tags, ElementBounds bounds)
{
    m_tags = tags;
    m_xmlBytes = xmlBytes;

    // Restrict the search to the range covered by the enclosing element.
    var ichMin = bounds.BeginTagOffset;
    var ichLim = bounds.EndTagOffset;
    Reset(ichMin, ichLim);
}
/// <summary>
/// Register the annotation DTO for deletion. When the annotation has an AStr element
/// inside its Comment element, file its xml under the first guid of its AppliesTo property.
/// </summary>
/// <param name="goners">Collection that always receives <paramref name="annDto"/>.</param>
/// <param name="annElements">
/// Map from the first AppliesTo guid to the xml bytes of the annotations that apply to it.
/// A new list is created for a guid that is not yet present.
/// </param>
/// <param name="annDto">The DTO of the annotation.</param>
/// <param name="annElement">The xml bytes of the annotation.</param>
private static void PreprocessTranslationOrNoteAnnotation(ICollection<DomainObjectDTO> goners, Dictionary<string, List<byte[]>> annElements, DomainObjectDTO annDto, byte[] annElement)
{
    // The old annotation goes away whether or not it has usable content.
    goners.Add(annDto);

    var annotation = new ElementBounds(annElement, s_tagsCmAnnotation);
    var comment = new ElementBounds(annElement, s_tagsComment,
        annotation.BeginTagOffset, annotation.EndTagOffset);
    var astr = new ElementBounds(annElement, s_tagsAStr,
        comment.BeginTagOffset, comment.EndTagOffset);
    if (!astr.IsValid)
    {
        return;
    }

    var targets = GetAppliesToObjsurGuids(annElement);
    if (targets == null || targets.Count == 0)
    {
        return;
    }

    // Only the first target is used as the key.
    var key = targets[0];
    List<byte[]> bucket;
    if (!annElements.TryGetValue(key, out bucket))
    {
        bucket = new List<byte[]>();
        annElements[key] = bucket;
    }
    bucket.Add(annElement);
}
/// <summary>
/// Find the first AStr element inside the Comment element (within the given range)
/// whose ws attribute is exactly "en".
/// </summary>
/// <param name="xmlBytes">The xml bytes to search.</param>
/// <param name="ichMin">Start of the range to search for the Comment element.</param>
/// <param name="ichLim">Limit of the range to search for the Comment element.</param>
/// <returns>
/// The bounds of the matching AStr, or null when there is no Comment element or none of
/// its AStr elements has ws="en".
/// </returns>
private static ElementBounds GetEnglishCommentBounds(byte[] xmlBytes, int ichMin, int ichLim)
{
    var comment = new ElementBounds(xmlBytes, s_tagsComment, ichMin, ichLim);
    if (!comment.IsValid)
    {
        return null;
    }

    // Walk the AStr elements one after another, never leaving the Comment element.
    for (var astr = new ElementBounds(xmlBytes, s_tagsAStr, comment.BeginTagOffset, comment.EndTagOffset);
        astr.IsValid;
        astr.Reset(astr.EndTagOffset, comment.EndTagOffset))
    {
        if (astr.GetAttributeValue(s_wsAttr) == "en")
        {
            return astr;
        }
    }

    return null;
}