/// <summary>
/// Passes an empty string and then <c>null</c> to <c>add</c> on an extractor built
/// with a null target and a threshold of 0. There are no assertions, so the test
/// passes as long as neither call throws.
/// </summary>
public void testAdd()
{
    var tagExtractor = new TagExtractor(null, 0);

    tagExtractor.add("");
    tagExtractor.add(null); // ignore
}
/// <summary>
/// Walks an extractor targeting "abstract" with a threshold of 10 through three
/// tag/text/close cycles and checks the extractor count and the profile's
/// <c>n_words</c> counters after each one.
/// </summary>
public void testNormalScenario()
{
    TagExtractor extractor = new TagExtractor("abstract", 10);
    // Assert.AreEqual takes (expected, actual); keeping that order makes failure messages read correctly.
    Assert.AreEqual(0, extractor.count());

    LangProfile profile = new LangProfile("en");

    // normal
    extractor.setTag("abstract");
    extractor.add("This is a sample text.");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());
    Assert.AreEqual(17, profile.n_words[0]); // Thisisasampletext
    Assert.AreEqual(22, profile.n_words[1]); // _T, Th, hi, ...
    Assert.AreEqual(17, profile.n_words[2]); // _Th, Thi, his, ...

    // too short
    extractor.setTag("abstract");
    extractor.add("sample");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());

    // other tags
    extractor.setTag("div");
    extractor.add("This is a sample text which is enough long.");
    profile.update(extractor.closeTag());
    Assert.AreEqual(1, extractor.count());
}
/// <summary>
/// Creates a fresh root tag named "Reflection" for <c>MyCharacter</c>, runs the
/// attribute tag extraction against it, and swaps it into the SINner file's
/// metadata in place of any existing tag with the same name.
/// </summary>
/// <returns>A new list containing the metadata tags after the replacement.</returns>
internal List <Tag> PopulateTags()
{
    var reflectionTag = new Tag
    {
        Tags = new List<Tag>(),
        MyRuntimeObject = MyCharacter,
        MyParentTag = null,
        ParentTagId = Guid.Empty,
        Id = Guid.NewGuid(),
        TagName = "Reflection"
    };

    // The return value is not needed here.
    // NOTE(review): presumably the call attaches the extracted tags to reflectionTag - confirm.
    TagExtractor.ExtractTagsFromAttributes(MyCharacter, reflectionTag);

    // Filter a snapshot so that removing from the live collection inside the loop is safe.
    var staleTags = MySINnerFile.SiNnerMetaData.Tags.ToList().Where(a => a.TagName == "Reflection");
    foreach (var stale in staleTags)
    {
        MySINnerFile.SiNnerMetaData.Tags.Remove(stale);
    }

    MySINnerFile.SiNnerMetaData.Tags.Add(reflectionTag);

    // Stamp every top-level tag (and, via the recursive call, its children) with the file id.
    foreach (var topLevelTag in MySINnerFile.SiNnerMetaData.Tags)
    {
        topLevelTag.SetSinnerIdRecursive(MySINnerFile.Id);
    }

    return MySINnerFile.SiNnerMetaData.Tags.ToList();
}
/// <summary>
/// Checks that <c>setTag</c> stores the value it is given in <c>tag_</c>,
/// for both an empty string and <c>null</c>.
/// </summary>
public void testSetTag()
{
    TagExtractor extractor = new TagExtractor(null, 0);

    // Assert.AreEqual takes (expected, actual).
    extractor.setTag("");
    Assert.AreEqual("", extractor.tag_);

    extractor.setTag(null);
    Assert.AreEqual(null, extractor.tag_);
}
/// <summary>
/// Loaded handler for the main window: creates the repeated-files finder, the tag
/// extractor and the folder browser dialog, and subscribes to the
/// <c>StatusChanged</c> event of the first two.
/// </summary>
/// <param name="sender">Event source; not used.</param>
/// <param name="e">Event data; not used.</param>
void MainWindow_Loaded(object sender, RoutedEventArgs e)
{
    var finder = new RepeatedFilesFinder();
    finder.StatusChanged += _repeatedFilesFinder_StatusChanged;
    _repeatedFilesFinder = finder;

    var extractor = new TagExtractor();
    extractor.StatusChanged += _tagExtractor_StatusChanged;
    _tagExtractor = extractor;

    _folderBrowser = new System.Windows.Forms.FolderBrowserDialog();
}
/// <summary>
/// Checks that the constructor stores its two arguments in <c>target_</c> and
/// <c>threshold_</c>, for both a null/0 pair and an "abstract"/10 pair.
/// </summary>
public void testTagExtractor()
{
    // Assert.AreEqual takes (expected, actual).
    TagExtractor extractor = new TagExtractor(null, 0);
    Assert.AreEqual(null, extractor.target_);
    Assert.AreEqual(0, extractor.threshold_);

    TagExtractor extractor2 = new TagExtractor("abstract", 10);
    Assert.AreEqual("abstract", extractor2.target_);
    Assert.AreEqual(10, extractor2.threshold_);
}
/// <summary>
/// Transfers every posted property value from the DTO onto the corresponding
/// property of the persisted model, going through the property editor's value editor.
/// </summary>
/// <typeparam name="TPersisted">Type of the persisted content being saved.</typeparam>
/// <param name="contentItem">Save model carrying the posted DTO, the persisted content and any uploaded files.</param>
protected virtual void MapPropertyValues <TPersisted>(ContentBaseItemSave <TPersisted> contentItem) where TPersisted : IContentBase
{
    foreach (var dtoProperty in contentItem.ContentDto.Properties)
    {
        // Property on the persisted model that will receive the value.
        var persistedProperty = contentItem.PersistedContent.Properties[dtoProperty.Alias];

        // Extra data handed to the property editor alongside the value.
        var additionalData = new Dictionary<string, object>();

        // Uploaded files belonging to this property, if any.
        var uploads = contentItem.UploadedFiles.Where(x => x.PropertyAlias == dtoProperty.Alias).ToArray();
        if (uploads.Length > 0)
        {
            additionalData.Add("files", uploads);
        }

        // The array stored above holds these same objects, so it sees the sanitised names.
        foreach (var upload in uploads)
        {
            upload.FileName = upload.FileName.ToSafeFileName();
        }

        // Extra things needed to figure out where to put the files.
        additionalData.Add("cuid", contentItem.PersistedContent.Key);
        additionalData.Add("puid", persistedProperty.PropertyType.Key);

        var editorData = new ContentPropertyData(dtoProperty.Value, dtoProperty.PreValues, additionalData);

        if (dtoProperty.PropertyEditor == null)
        {
            LogHelper.Warn<ContentController>("No property editor found for property " + dtoProperty.Alias);
            continue;
        }

        var valueEditor = dtoProperty.PropertyEditor.ValueEditor;

        // Don't persist any bound value if the editor is read-only.
        if (valueEditor.IsReadOnly == false)
        {
            var dbValue = dtoProperty.PropertyEditor.ValueEditor.ConvertEditorToDb(editorData, persistedProperty.Value);

            var tagSupport = TagExtractor.GetAttribute(dtoProperty.PropertyEditor);
            if (tagSupport == null)
            {
                persistedProperty.Value = dbValue;
            }
            else
            {
                // Tag-enabled editors store their value through the tag extractor instead.
                TagExtractor.SetPropertyTags(persistedProperty, editorData, dbValue, tagSupport);
            }
        }
    }
}
/// <summary>
/// Checks that <c>clear</c> empties the text buffer and resets the current tag to
/// <c>null</c> after a tag and some text have been set.
/// </summary>
public void testClear()
{
    TagExtractor extractor = new TagExtractor("abstract", 10);
    extractor.setTag("abstract");
    extractor.add("This is a sample text.");

    // Assert.AreEqual takes (expected, actual).
    Assert.AreEqual("This is a sample text.", extractor.buf_.ToString());
    Assert.AreEqual("abstract", extractor.tag_);

    extractor.clear();

    Assert.AreEqual("", extractor.buf_.ToString());
    Assert.AreEqual(null, extractor.tag_);
}
/// <summary>
/// Load Wikipedia abstract database file and generate its language profile
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">target database file path; a file whose name ends in ".gz" is read through a gzip decompression stream</param>
/// <returns>Language profile instance</returns>
/// <exception cref="LangDetectException" />
public static LangProfile loadFromWikipediaAbstract(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    FileInfo fi = new FileInfo(file);
    Stream _is = null;
    try
    {
        _is = fi.OpenRead();

        // File-name suffix check: compare ordinally, not with the current culture.
        if (fi.Name.EndsWith(".gz", System.StringComparison.Ordinal))
        {
            _is = new GZipStream(_is, CompressionMode.Decompress);
        }

        using (StreamReader br = new StreamReader(_is, System.Text.Encoding.UTF8))
        using (XmlReader reader = XmlReader.Create(br))
        {
            TagExtractor tagextractor = new TagExtractor("abstract", 100);
            while (reader.Read())
            {
                switch (reader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.setTag(reader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.add(reader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        // closeTag returns null when there is nothing to add to the profile.
                        string text = tagextractor.closeTag();
                        if (text != null)
                        {
                            profile.update(text);
                        }
                        break;
                }
            }
        }
    }
    finally
    {
        // Covers the case where the reader was never constructed; a single Dispose
        // is enough (the original called both Close and Dispose).
        if (null != _is)
        {
            _is.Dispose();
        }
    }

    return profile;
}
/// <summary>
/// Stores the supplied collaborators and creates the tag extractor, which is
/// given a new <c>TagFactory</c>.
/// </summary>
/// <param name="sentenceProvider">Stored as the sentence template provider.</param>
/// <param name="wordSelectorFactory">Stored as the word selector factory.</param>
/// <param name="stringFormatter">Stored as the string formatter.</param>
internal Generator(
    ISentenceTemplateProvider sentenceProvider,
    IWordSelectorFactory wordSelectorFactory,
    IStringFormatter stringFormatter)
{
    _sentenceTemplateProvider = sentenceProvider;
    _wordSelectorFactory = wordSelectorFactory;
    _stringFormatter = stringFormatter;

    var factory = new TagFactory();
    _tagExtractor = new TagExtractor(tagFactory: factory);
}
/// <summary>
/// Reads an XML file, feeds its elements and text nodes through a tag extractor
/// targeting "abstract" with a threshold of 100, and returns the profile the
/// extractor was closed against. Writes the language and the extractor's count to the console.
/// </summary>
/// <param name="lang">Language name used to create the profile and in the console output.</param>
/// <param name="file">Path of the file to read; a ".gz" extension (any casing) is read through a gzip decompression stream.</param>
/// <returns>The language profile instance.</returns>
public static LangProfile load(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    TagExtractor tagextractor = new TagExtractor("abstract", 100);
    Stream inputStream = null;
    try
    {
        inputStream = File.OpenRead(file);

        // Case-insensitive ordinal comparison instead of upper-casing with the current culture;
        // string.Equals also tolerates a null extension.
        if (string.Equals(Path.GetExtension(file), ".gz", System.StringComparison.OrdinalIgnoreCase))
        {
            inputStream = new GZipStream(inputStream, CompressionMode.Decompress);
        }

        using (XmlReader xmlReader = XmlReader.Create(inputStream))
        {
            while (xmlReader.Read())
            {
                switch (xmlReader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.SetTag(xmlReader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.Add(xmlReader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        tagextractor.CloseTag(profile);
                        break;
                }
            }
        }
    }
    finally
    {
        if (inputStream != null)
        {
            inputStream.Close();
        }
    }

    Console.WriteLine(lang + ": " + tagextractor.Count);
    return profile;
}
/// <summary>
/// Demonstration routine: for every tag of a newly created track, applies a
/// <c>TagSetter</c> and then a <c>TagExtractor</c>, keeping the extractor's result
/// typed as the base <c>ITag</c> interface.
/// </summary>
public static void DoSomething_WorksButNotIdeal()
{
    MusicTrack firstTrack = new MusicTrack();
    MusicTrack secondTrack = new MusicTrack(); // created but not used below
    TagSetter tagSetter = new TagSetter();
    TagExtractor tagExtractor = new TagExtractor();

    // Set some values on the tracks
    foreach (IExistsTag currentTag in firstTrack.Tags.Values)
    {
        currentTag.Apply(tagSetter);

        // do stuff using base interface if necessary
        ITag extracted = currentTag.Apply(tagExtractor);
    }
}
/// <summary>
/// Builds a new "Reflection" tag for <c>MyCharacter</c>, fills it with the tags
/// extracted from the character's attributes, and swaps it into the SINner file's
/// metadata in place of any existing tag with the same name.
/// </summary>
/// <returns>A new list containing the metadata tags after the replacement.</returns>
internal IList <Tag> PopulateTags()
{
    var reflectionTag = new Tag
    {
        MyRuntimeObject = MyCharacter,
        SiNnerId = MySINnerFile.Id,
        TagName = "Reflection"
    };
    reflectionTag.Tags.AddRange(TagExtractor.ExtractTagsFromAttributes(MyCharacter));

    // Materialise the matches first so the live collection can be modified while looping.
    // Null entries are tolerated by the null-conditional access.
    var staleTags = MySINnerFile.SiNnerMetaData.Tags
        .Where(x => x?.TagName == "Reflection")
        .ToList();
    foreach (var stale in staleTags)
    {
        MySINnerFile.SiNnerMetaData.Tags.Remove(stale);
    }

    MySINnerFile.SiNnerMetaData.Tags.Add(reflectionTag);

    // Stamp every top-level tag (and, via the recursive call, its children) with the file id.
    foreach (var topLevelTag in MySINnerFile.SiNnerMetaData.Tags)
    {
        topLevelTag.SetSinnerIdRecursive(MySINnerFile.Id);
    }

    return MySINnerFile.SiNnerMetaData.Tags.ToList();
}
/// <summary>
/// Builds the property collection for each supplied document definition, keyed by the definition's id.
/// </summary>
/// <remarks>
/// Rewrites the document query into a sub-select, fetches all matching property data in one query,
/// inserts a property data row for any built property that has no identity yet while its property
/// type does, and runs tag extraction for properties whose editor exposes a tag-support attribute.
/// NOTE(review): <paramref name="documentDefs"/> is enumerated more than once (Any, then GroupBy) -
/// confirm callers pass a materialised sequence.
/// </remarks>
/// <param name="docSql">The query that selected the documents; its FROM clause and arguments are reused.</param>
/// <param name="documentDefs">The document definitions to build property collections for.</param>
/// <returns>Property collections keyed by document definition id; empty when there are no definitions.</returns>
protected IDictionary <int, PropertyCollection> GetPropertyCollection( Sql docSql, IEnumerable <DocumentDefinition> documentDefs)
{
    if (documentDefs.Any() == false)
    {
        return(new Dictionary <int, PropertyCollection>());
    }

    //we need to parse the original SQL statement and reduce the columns to just cmsContent.nodeId, cmsContentVersion.VersionId so that we can use
    // the statement to go get the property data for all of the items by using an inner join
    // ({0} is a placeholder that string.Format fills with the column list further down)
    var parsedOriginalSql = "SELECT {0} " + docSql.SQL.Substring(docSql.SQL.IndexOf("FROM", StringComparison.Ordinal));

    //now remove everything from an Orderby clause and beyond
    if (parsedOriginalSql.InvariantContains("ORDER BY "))
    {
        parsedOriginalSql = parsedOriginalSql.Substring(0, parsedOriginalSql.LastIndexOf("ORDER BY ", StringComparison.Ordinal));
    }

    // property data for every document returned by the original query, joined through the rewritten sub-select
    var propSql = new Sql(@"SELECT cmsPropertyData.* FROM cmsPropertyData INNER JOIN cmsPropertyType ON cmsPropertyData.propertytypeid = cmsPropertyType.id INNER JOIN (" + string.Format(parsedOriginalSql, "cmsContent.nodeId, cmsContentVersion.VersionId") + @") as docData ON cmsPropertyData.versionId = docData.VersionId AND cmsPropertyData.contentNodeId = docData.nodeId LEFT OUTER JOIN cmsDataTypePreValues ON cmsPropertyType.dataTypeId = cmsDataTypePreValues.datatypeNodeId", docSql.Arguments);

    var allPropertyData = Database.Fetch <PropertyDataDto>(propSql);

    //This is a lazy access call to get all prevalue data for the data types that make up all of these properties which we use
    // below if any property requires tag support
    var allPreValues = new Lazy <IEnumerable <DataTypePreValueDto> >(() =>
    {
        var preValsSql = new Sql(@"SELECT a.id, a.value, a.sortorder, a.alias, a.datatypeNodeId FROM cmsDataTypePreValues a WHERE EXISTS( SELECT DISTINCT b.id as preValIdInner FROM cmsDataTypePreValues b INNER JOIN cmsPropertyType ON b.datatypeNodeId = cmsPropertyType.dataTypeId INNER JOIN (" + string.Format(parsedOriginalSql, "DISTINCT cmsContent.contentType") + @") as docData ON 
cmsPropertyType.contentTypeId = docData.contentType WHERE a.id = b.id)", docSql.Arguments);
        return(Database.Fetch <DataTypePreValueDto>(preValsSql));
    });

    var result = new Dictionary <int, PropertyCollection>();

    // cache of tag-support attributes per property editor alias (only non-null results are stored)
    var propertiesWithTagSupport = new Dictionary <string, SupportTagsAttribute>();

    //iterate each definition grouped by its content type - this will mean less property type iterations while building
    // up the property collections
    foreach (var compositionGroup in documentDefs.GroupBy(x => x.Composition))
    {
        var compositionProperties = compositionGroup.Key.CompositionPropertyTypes.ToArray();

        foreach (var def in compositionGroup)
        {
            // NOTE(review): this scans the whole fetched property data list once per definition
            var propertyDataDtos = allPropertyData.Where(x => x.NodeId == def.Id).Distinct();

            var propertyFactory = new PropertyFactory(compositionProperties, def.Version, def.Id, def.CreateDate, def.VersionDate);
            var properties = propertyFactory.BuildEntity(propertyDataDtos.ToArray()).ToArray();

            // properties without an identity whose property type has one get a property data row inserted
            var newProperties = properties.Where(x => x.HasIdentity == false && x.PropertyType.HasIdentity);
            foreach (var property in newProperties)
            {
                var propertyDataDto = new PropertyDataDto { NodeId = def.Id, PropertyTypeId = property.PropertyTypeId, VersionId = def.Version };
                int primaryKey = Convert.ToInt32(Database.Insert(propertyDataDto));

                property.Version = def.Version;
                property.Id = primaryKey;
            }

            foreach (var property in properties)
            {
                //NOTE: The benchmarks run with and without the following code show very little change so this is not a perf bottleneck
                var editor = PropertyEditorResolver.Current.GetByAlias(property.PropertyType.PropertyEditorAlias);

                var tagSupport = propertiesWithTagSupport.ContainsKey(property.PropertyType.PropertyEditorAlias) ? 
                    propertiesWithTagSupport[property.PropertyType.PropertyEditorAlias] : TagExtractor.GetAttribute(editor);

                if (tagSupport != null)
                {
                    //add to local cache so we don't need to reflect next time for this property editor alias
                    propertiesWithTagSupport[property.PropertyType.PropertyEditorAlias] = tagSupport;

                    //this property has tags, so we need to extract them and for that we need the prevals which we've already looked up
                    // (first access to allPreValues.Value runs the pre-value query)
                    var preValData = allPreValues.Value.Where(x => x.DataTypeNodeId == property.PropertyType.DataTypeDefinitionId) .Distinct() .ToArray();

                    var asDictionary = preValData.ToDictionary(x => x.Alias, x => new PreValue(x.Id, x.Value, x.SortOrder));

                    var preVals = new PreValueCollection(asDictionary);

                    var contentPropData = new ContentPropertyData(property.Value, preVals, new Dictionary <string, object>());

                    TagExtractor.SetPropertyTags(property, contentPropData, property.Value, tagSupport);
                }
            }

            // a duplicate definition id is only logged; the later property set overwrites the earlier one
            if (result.ContainsKey(def.Id))
            {
                Logger.Warn <VersionableRepositoryBase <TId, TEntity> >("The query returned multiple property sets for document definition " + def.Id + ", " + def.Composition.Name);
            }
            result[def.Id] = new PropertyCollection(properties);
        }
    }

    return(result);
}
/// <summary>
/// Calls <c>closeTag</c> on an extractor built with a null target and a threshold
/// of 0 without setting a tag first. There are no assertions, so the test passes
/// as long as the call does not throw.
/// </summary>
public void testCloseTag()
{
    var tagExtractor = new TagExtractor(null, 0);

    tagExtractor.closeTag(); // ignore
}
/// <summary>
/// Builds an <c>ApiExtractor</c> from mocked clients, generates the API template into a
/// test-specific output directory and checks the generated files, the template parameters
/// and the count, type and properties of each typed resource group.
/// </summary>
public async Task GenerateApiTemplates_ProperlyLaysTheInformation()
{
    // arrange
    var testDirectory = Path.Combine(this.OutputDirectory, nameof(GenerateApiTemplates_ProperlyLaysTheInformation));

    var configuration = this.GetMockedExtractorConsoleAppConfiguration(
        splitApis: false,
        apiVersionSetName: string.Empty,
        multipleApiNames: string.Empty,
        includeAllRevisions: false);
    var extractorParameters = new ExtractorParameters(configuration);

    // mocked clients
    var apiClient = MockApisClient.GetMockedApiClientWithDefaultValues();
    var productClient = MockProductsClient.GetMockedApiClientWithDefaultValues();
    var apiSchemaClient = MockApiSchemaClient.GetMockedApiClientWithDefaultValues();
    var policyClient = MockPolicyClient.GetMockedApiClientWithDefaultValues();
    var tagClient = MockTagClient.GetMockedApiClientWithDefaultValues();
    var apiOperationClient = MockApiOperationClient.GetMockedApiClientWithDefaultValues();
    var diagnosticClient = MockDiagnosticClient.GetMockedClientWithApiDependentValues();

    // extractors wired to the mocked clients
    var diagnosticExtractor = new DiagnosticExtractor(this.GetTestLogger<DiagnosticExtractor>(), diagnosticClient);
    var apiSchemaExtractor = new ApiSchemaExtractor(this.GetTestLogger<ApiSchemaExtractor>(), apiSchemaClient);
    var policyExtractor = new PolicyExtractor(this.GetTestLogger<PolicyExtractor>(), policyClient, new TemplateBuilder());
    var productApisExtractor = new ProductApisExtractor(this.GetTestLogger<ProductApisExtractor>(), productClient, apiClient, new TemplateBuilder());
    var tagExtractor = new TagExtractor(this.GetTestLogger<TagExtractor>(), tagClient, new TemplateBuilder());
    var apiOperationExtractor = new ApiOperationExtractor(this.GetTestLogger<ApiOperationExtractor>(), apiOperationClient);

    var apiExtractor = new ApiExtractor(
        this.GetTestLogger<ApiExtractor>(),
        new TemplateBuilder(),
        apiClient,
        diagnosticExtractor,
        apiSchemaExtractor,
        policyExtractor,
        productApisExtractor,
        tagExtractor,
        apiOperationExtractor);

    var executor = ExtractorExecutor.BuildExtractorExecutor(
        this.GetTestLogger<ExtractorExecutor>(),
        apiExtractor: apiExtractor);
    executor.SetExtractorParameters(extractorParameters);

    // act
    var apiTemplate = await executor.GenerateApiTemplateAsync(
        singleApiName: It.IsAny<string>(),
        multipleApiNames: It.IsAny<List<string>>(),
        testDirectory);

    // assert
    var resources = apiTemplate.TypedResources;

    File.Exists(Path.Combine(testDirectory, resources.FileName)).Should().BeTrue();
    Directory.GetFiles(Path.Combine(testDirectory, PolicyExtractor.PoliciesDirectoryName)).Count().Should().Be(4);

    var templateParameters = apiTemplate.Parameters;
    templateParameters.Should().NotBeNull();
    templateParameters.Should().ContainKey(ParameterNames.ApimServiceName);
    templateParameters.Should().ContainKey(ParameterNames.ServiceUrl);
    templateParameters.Should().ContainKey(ParameterNames.ApiLoggerId);
    templateParameters.Should().ContainKey(ParameterNames.PolicyXMLBaseUrl);
    templateParameters.Should().ContainKey(ParameterNames.PolicyXMLSasToken);

    apiTemplate.Resources.Count().Should().Be(23);

    // apis
    resources.Apis.Count().Should().Be(2);
    resources.Apis.All(x => x.Type == ResourceTypeConstants.API).Should().BeTrue();
    resources.Apis.All(x => x.Properties is not null).Should().BeTrue();

    // api schemas
    resources.ApiSchemas.Count().Should().Be(2);
    resources.ApiSchemas.All(x => x.Type == ResourceTypeConstants.APISchema).Should().BeTrue();
    resources.ApiSchemas.All(x => x.Properties is not null).Should().BeTrue();

    // diagnostics
    resources.Diagnostics.Count().Should().Be(3);
    resources.Diagnostics.All(x => x.Type == ResourceTypeConstants.APIServiceDiagnostic || x.Type == ResourceTypeConstants.APIDiagnostic).Should().BeTrue();
    resources.Diagnostics.All(x => x.Properties is not null).Should().BeTrue();

    // tags
    resources.Tags.Count().Should().Be(4);
    resources.Tags.All(x => x.Type == ResourceTypeConstants.ProductTag).Should().BeTrue();

    // api products
    resources.ApiProducts.Count().Should().Be(2);
    resources.ApiProducts.All(x => x.Type == ResourceTypeConstants.ProductApi).Should().BeTrue();
    resources.ApiProducts.All(x => x.Properties is not null).Should().BeTrue();

    // api policies
    resources.ApiPolicies.Count().Should().Be(2);
    resources.ApiPolicies.All(x => x.Properties is not null).Should().BeTrue();

    // api operations
    resources.ApiOperations.Count().Should().Be(2);
    resources.ApiOperations.All(x => x.Type == ResourceTypeConstants.APIOperation).Should().BeTrue();
    resources.ApiOperations.All(x => x.Properties is not null).Should().BeTrue();
    resources.ApiOperations.SelectMany(x => x.DependsOn).Any(x => x.Contains($"'{ResourceTypeConstants.API}'")).Should().BeTrue();
    resources.ApiOperations.SelectMany(x => x.DependsOn).Any(x => x.Contains($"'{ResourceTypeConstants.APIOperation}'")).Should().BeFalse();

    // api operations policies
    // NOTE(review): the second assertion re-checks ApiOperations rather than ApiOperationsPolicies - confirm intent.
    resources.ApiOperationsPolicies.Count().Should().Be(2);
    resources.ApiOperations.All(x => x.Properties is not null).Should().BeTrue();

    // api operations tags
    // NOTE(review): this section repeats the two assertions above and never inspects any
    // operation-tag collection - looks like a copy/paste leftover; confirm what should be asserted.
    resources.ApiOperationsPolicies.Count().Should().Be(2);
    resources.ApiOperations.All(x => x.Properties is not null).Should().BeTrue();
}
/// <summary>
/// Applies the values of a posted model to an entity: sets the name property when one is
/// configured, then converts every non-read-only posted property through its property editor
/// (with optional tag handling, value mapping, encryption and type conversion) and writes
/// the result onto the entity.
/// </summary>
/// <param name="section">Section configuration; its alias is used to build the file-path identifiers.</param>
/// <param name="collection">Collection configuration describing the editor fields, name/id properties and encrypted properties.</param>
/// <param name="postModel">The posted model holding the name, property values and uploaded files.</param>
/// <param name="entity">The entity to update.</param>
/// <returns>The same <paramref name="entity"/> instance, after it has been updated.</returns>
public object FromPostModel(FluiditySectionConfig section, FluidityCollectionConfig collection, FluidityEntityPostModel postModel, object entity)
{
    var editorProps = collection.Editor.Tabs.SelectMany(x => x.Fields).ToArray();

    // Update the name property
    if (collection.NameProperty != null)
    {
        entity.SetPropertyValue(collection.NameProperty, postModel.Name);
    }

    // Update the individual properties
    foreach (var prop in postModel.Properties)
    {
        // Get the prop config (throws if the posted alias has no matching editor field)
        var propConfig = editorProps.First(x => x.Property.Name == prop.Alias);
        if (propConfig.IsReadOnly)
        {
            continue;
        }

        // Create additional data for file handling
        var additionalData = new Dictionary<string, object>();

        // Grab any uploaded files and add them to the additional data
        var files = postModel.UploadedFiles.Where(x => x.PropertyAlias == prop.Alias).ToArray();
        if (files.Length > 0)
        {
            additionalData.Add("files", files);
        }

        // Ensure safe filenames
        foreach (var file in files)
        {
            file.FileName = file.FileName.ToSafeFileName();
        }

        // Add extra things needed to figure out where to put the files
        // Looking into the core code, these are not actually used for any lookups,
        // rather they are used to generate a unique path, so we just use the nearest
        // equivalents from the fluidity api.
        var cuid = $"{section.Alias}_{collection.Alias}_{entity.GetPropertyValue(collection.IdProperty)}";
        var puid = $"{section.Alias}_{collection.Alias}_{propConfig.Property.Name}";

        additionalData.Add("cuid", ObjectExtensions.EncodeAsGuid(cuid));
        additionalData.Add("puid", ObjectExtensions.EncodeAsGuid(puid));

        var dataTypeInfo = _dataTypeHelper.ResolveDataType(propConfig, collection.IsReadOnly);
        var data = new ContentPropertyData(prop.Value, dataTypeInfo.PreValues, additionalData);

        if (dataTypeInfo.PropertyEditor.ValueEditor.IsReadOnly)
        {
            continue;
        }

        var currentValue = entity.GetPropertyValue(propConfig.Property);

        var encryptedProp = collection.EncryptedProperties?.FirstOrDefault(x => x.Name == propConfig.Property.Name);

        // A property that has never been set has nothing to decrypt; calling ToString()
        // on the null value would throw a NullReferenceException.
        if (encryptedProp != null && currentValue != null)
        {
            currentValue = SecurityHelper.Decrypt(currentValue.ToString());
        }

        if (propConfig.ValueMapper != null)
        {
            currentValue = propConfig.ValueMapper.ModelToEditor(currentValue);
        }

        var propVal = dataTypeInfo.PropertyEditor.ValueEditor.ConvertEditorToDb(data, currentValue);

        var supportTagsAttribute = TagExtractor.GetAttribute(dataTypeInfo.PropertyEditor);
        if (supportTagsAttribute != null)
        {
            // Tag extraction works on a Property instance, so use a throw-away one and read the value back.
            var dummyProp = new Property(new PropertyType(dataTypeInfo.DataTypeDefinition), propVal);
            TagExtractor.SetPropertyTags(dummyProp, data, propVal, supportTagsAttribute);
            propVal = dummyProp.Value;
        }

        if (propConfig.ValueMapper != null)
        {
            propVal = propConfig.ValueMapper.EditorToModel(propVal);
        }

        // Same null guard as for decryption: a null value is stored as null rather than encrypted.
        if (encryptedProp != null && propVal != null)
        {
            propVal = SecurityHelper.Encrypt(propVal.ToString());
        }

        // Best-effort conversion to the declared property type; on failure the value is written unchanged.
        if (propVal != null && propVal.GetType() != propConfig.Property.Type)
        {
            var convert = propVal.TryConvertTo(propConfig.Property.Type);
            if (convert.Success)
            {
                propVal = convert.Result;
            }
        }

        entity.SetPropertyValue(propConfig.Property, propVal);
    }

    return entity;
}
/// <summary>
/// Gets the property collection for a query
/// </summary>
/// <remarks>
/// Pre-values are fetched eagerly; property data is then streamed through a single forward-only
/// enumerator that is consumed in step with the definitions, which are iterated ordered by id and
/// then version. Tag extraction is run for properties whose editor exposes a tag-support attribute.
/// </remarks>
/// <param name="pagingSqlQuery">The query that selected the documents; its paged form is used when it has paging.</param>
/// <param name="documentDefs">The document definitions to build property collections for.</param>
/// <returns>Property collections keyed by document definition version; empty when there are no definitions.</returns>
/// <exception cref="InvalidOperationException">
/// When two definitions share a version and <c>ThrowOnWarning</c> is set.
/// </exception>
protected IDictionary <Guid, PropertyCollection> GetPropertyCollection( PagingSqlQuery pagingSqlQuery, IReadOnlyCollection <DocumentDefinition> documentDefs)
{
    if (documentDefs.Count == 0)
    {
        return(new Dictionary <Guid, PropertyCollection>());
    }

    //initialize to the query passed in
    var docSql = pagingSqlQuery.PrePagedSql;

    //we need to parse the original SQL statement and reduce the columns to just cmsContent.nodeId, cmsContentVersion.VersionId so that we can use
    // the statement to go get the property data for all of the items by using an inner join
    // ({0} is a placeholder that string.Format fills with the column list further down)
    var parsedOriginalSql = "SELECT {0} " + docSql.SQL.Substring(docSql.SQL.IndexOf("FROM", StringComparison.Ordinal));

    if (pagingSqlQuery.HasPaging)
    {
        //if this is a paged query, build the paged query with the custom column substitution, then re-assign
        docSql = pagingSqlQuery.BuildPagedQuery("{0}");
        parsedOriginalSql = docSql.SQL;
    }
    else if (parsedOriginalSql.InvariantContains("ORDER BY "))
    {
        //now remove everything from an Orderby clause and beyond if this is unpaged data
        parsedOriginalSql = parsedOriginalSql.Substring(0, parsedOriginalSql.LastIndexOf("ORDER BY ", StringComparison.Ordinal));
    }

    //This retrieves all pre-values for all data types that are referenced for all property types
    // that exist in the data set.
    //Benchmarks show that eagerly loading these so that we can lazily read the property data
    // below (with the use of Query instead of Fetch) go about 30% faster, so we'll eagerly load
    // this now since we cannot execute another reader inside of reading the property data.
    var preValsSql = new Sql(@"SELECT a.id, a.value, a.sortorder, a.alias, a.datatypeNodeId FROM cmsDataTypePreValues a WHERE EXISTS( SELECT DISTINCT b.id as preValIdInner FROM cmsDataTypePreValues b INNER JOIN cmsPropertyType ON b.datatypeNodeId = cmsPropertyType.dataTypeId INNER JOIN (" + string.Format(parsedOriginalSql, "cmsContent.contentType") + @") as docData ON cmsPropertyType.contentTypeId = docData.contentType WHERE a.id = b.id)", docSql.Arguments);

    var allPreValues = Database.Fetch <DataTypePreValueDto>(preValsSql);

    //It's Important with the sort order here! We require this to be sorted by node id,
    // this is required because this data set can be huge depending on the page size. Due
    // to its size we need to be smart about iterating over the property values to build
    // the document. Before we used to use Linq to get the property data for a given content node
    // and perform a Distinct() call. This kills performance because that would mean if we had 7000 nodes
    // and on each iteration we will perform a lookup on potentially 100,000 property rows against the node
    // id which turns out to be a crazy amount of iterations. Instead we know it's sorted by this value we'll
    // keep an index stored of the rows being read so we never have to re-iterate the entire data set
    // on each document iteration.
    var propSql = new Sql(@"SELECT cmsPropertyData.* FROM cmsPropertyData INNER JOIN cmsPropertyType ON cmsPropertyData.propertytypeid = cmsPropertyType.id INNER JOIN (" + string.Format(parsedOriginalSql, "cmsContent.nodeId, cmsContentVersion.VersionId") + @") as docData ON cmsPropertyData.versionId = docData.VersionId AND cmsPropertyData.contentNodeId = docData.nodeId ORDER BY contentNodeId, versionId, propertytypeid ", docSql.Arguments);

    //This does NOT fetch all data into memory in a list, this will read
    // over the records as a data reader, this is much better for performance and memory,
    // but it means that during the reading of this data set, nothing else can be read
    // from SQL server otherwise we'll get an exception.
    var allPropertyData = Database.Query <PropertyDataDto>(propSql);

    var result = new Dictionary <Guid, PropertyCollection>();

    // cache of tag-support attributes per property editor alias (only non-null results are stored)
    var propertiesWithTagSupport = new Dictionary <string, SupportTagsAttribute>();

    //used to track the resolved composition property types per content type so we don't have to re-resolve (ToArray) the list every time
    var resolvedCompositionProperties = new Dictionary <int, PropertyType[]>();

    //keep track of the current property data item being enumerated
    var propertyDataSetEnumerator = allPropertyData.GetEnumerator();
    var hasCurrent = false; // initially there is no enumerator.Current

    var comparer = new DocumentDefinitionComparer(SqlSyntax);

    try
    {
        //This must be sorted by node id because this is how we are sorting the query to lookup property types above,
        // which allows us to more efficiently iterate over the large data set of property values
        foreach (var def in documentDefs.OrderBy(x => x.Id).ThenBy(x => x.Version, comparer))
        {
            // get the resolved properties from our local cache, or resolve them and put them in cache
            PropertyType[] compositionProperties;
            if (resolvedCompositionProperties.ContainsKey(def.Composition.Id))
            {
                compositionProperties = resolvedCompositionProperties[def.Composition.Id];
            }
            else
            {
                compositionProperties = 
                    def.Composition.CompositionPropertyTypes.ToArray();
                resolvedCompositionProperties[def.Composition.Id] = compositionProperties;
            }

            // assemble the dtos for this def
            // use the available enumerator.Current if any else move to next
            // NOTE(review): a row whose version matches none of the remaining defs would stay as
            // Current and leave every later def with no rows - confirm the two orderings always agree.
            var propertyDataDtos = new List <PropertyDataDto>();
            while (hasCurrent || propertyDataSetEnumerator.MoveNext())
            {
                //Not checking null on VersionId because it can never be null - no idea why it's set to nullable
                // ReSharper disable once PossibleInvalidOperationException
                if (propertyDataSetEnumerator.Current.VersionId.Value == def.Version)
                {
                    hasCurrent = false; // enumerator.Current is not available
                    propertyDataDtos.Add(propertyDataSetEnumerator.Current);
                }
                else
                {
                    hasCurrent = true; // enumerator.Current is available for another def
                    break; // no more propertyDataDto for this def
                }
            }

            var properties = PropertyFactory.BuildEntity(propertyDataDtos, compositionProperties, def.CreateDate, def.VersionDate).ToArray();

            foreach (var property in properties)
            {
                //NOTE: The benchmarks run with and without the following code show very little change so this is not a perf bottleneck
                var editor = PropertyEditorResolver.Current.GetByAlias(property.PropertyType.PropertyEditorAlias);

                var tagSupport = propertiesWithTagSupport.ContainsKey(property.PropertyType.PropertyEditorAlias) ? 
                    propertiesWithTagSupport[property.PropertyType.PropertyEditorAlias] : TagExtractor.GetAttribute(editor);

                if (tagSupport != null)
                {
                    //add to local cache so we don't need to reflect next time for this property editor alias
                    propertiesWithTagSupport[property.PropertyType.PropertyEditorAlias] = tagSupport;

                    //this property has tags, so we need to extract them and for that we need the prevals which we've already looked up
                    var preValData = allPreValues.Where(x => x.DataTypeNodeId == property.PropertyType.DataTypeDefinitionId) .Distinct() .ToArray();

                    var asDictionary = preValData.ToDictionary(x => x.Alias, x => new PreValue(x.Id, x.Value, x.SortOrder));

                    var preVals = new PreValueCollection(asDictionary);

                    var contentPropData = new ContentPropertyData(property.Value, preVals);

                    TagExtractor.SetPropertyTags(property, contentPropData, property.Value, tagSupport);
                }
            }

            // a duplicate version either throws or is logged; when logged, the later property set overwrites the earlier one
            if (result.ContainsKey(def.Version))
            {
                var msg = string.Format("The query returned multiple property sets for document definition {0}, {1}, {2}", def.Id, def.Version, def.Composition.Name);
                if (ThrowOnWarning)
                {
                    throw new InvalidOperationException(msg);
                }
                else
                {
                    Logger.Warn <VersionableRepositoryBase <TId, TEntity> >(msg);
                }
            }
            result[def.Version] = new PropertyCollection(properties);
        }
    }
    finally
    {
        // always release the underlying reader, even when building a collection throws
        propertyDataSetEnumerator.Dispose();
    }

    return(result);
}
/// <summary>
/// Load Wikipedia abstract database file and generate its language profile
/// </summary>
/// <param name="lang">target language name</param>
/// <param name="file">target database file path; a path ending in ".gz" is read through a gzip decompression stream</param>
/// <returns>Language profile instance</returns>
/// <exception cref="LangDetectException">
/// Thrown with <c>ErrorCode.TrainDataFormatError</c> when reading the XML raises an <see cref="XmlException"/>,
/// or with <c>ErrorCode.CantOpenTrainData</c> when an <see cref="IOException"/> occurs.
/// </exception>
public static LangProfile LoadFromWikipediaAbstract(string lang, string file)
{
    LangProfile profile = new LangProfile(lang);
    Stream strm = null;
    StreamReader br = null;
    try
    {
        strm = File.OpenRead(file);

        // File-name suffix check: compare ordinally, not with the current culture.
        if (file.EndsWith(".gz", System.StringComparison.Ordinal))
        {
            strm = new GZipStream(strm, CompressionMode.Decompress);
        }
        br = new StreamReader(strm);

        TagExtractor tagextractor = new TagExtractor("abstract", 100);

        XmlReader reader = XmlReader.Create(br);
        try
        {
            while (reader.Read())
            {
                switch (reader.NodeType)
                {
                    case XmlNodeType.Element:
                        tagextractor.SetTag(reader.Name);
                        break;

                    case XmlNodeType.Text:
                        tagextractor.Add(reader.Value);
                        break;

                    case XmlNodeType.EndElement:
                        // CloseTag returns null when there is nothing to add to the profile.
                        string text = tagextractor.CloseTag();
                        if (text != null)
                        {
                            profile.Update(text);
                        }
                        break;
                }
            }
        }
        catch (XmlException)
        {
            throw new LangDetectException(ErrorCode.TrainDataFormatError, "Training database file '" + file + "' is an invalid XML.");
        }
        finally
        {
            try
            {
                reader.Close();
            }
            catch (XmlException)
            {
                // Best effort: a failure while closing must not mask the outcome of the read.
            }
        }

        Console.WriteLine(lang + ":" + tagextractor.Count());
    }
    catch (IOException)
    {
        throw new LangDetectException(ErrorCode.CantOpenTrainData, "Can't open training database file '" + file + "'");
    }
    finally
    {
        try
        {
            if (br != null)
            {
                br.Close();
            }
            else if (strm != null)
            {
                // The reader was never created, so the stream has to be released directly.
                strm.Close();
            }
        }
        catch (IOException)
        {
            // Best effort: a failure while closing must not mask the outcome of the read.
        }
    }

    return profile;
}