/// <summary>
/// Predicts the language of <paramref name="document"/> and stores the result in its
/// <c>Language</c> property.
/// </summary>
/// <remarks>
/// Nothing happens when the document is empty, or when its language is already set to a value
/// other than <c>Language.Unknown</c> or <c>Language.Any</c>. When the document has no spans,
/// a temporary <c>Document</c> built from at most its first 200 characters is tokenized and
/// used for the prediction instead of the document itself.
/// </remarks>
/// <param name="document">The document to label; its <c>Language</c> is overwritten with the prediction.</param>
public void Process(IDocument document)
{
    // Upper bound, in UTF-16 code units, on the text copied into the temporary document.
    const int MaxSampleLength = 200;

    // Second argument handed to Model.PredictMax.
    // NOTE(review): presumably a cap on how many tokens the model reads - confirm against PredictMax.
    const int PredictionLimit = 200;

    // Don't try to identify documents that are empty or already have their language set.
    if (document.Length == 0 || (document.Language != Language.Unknown && document.Language != Language.Any))
    {
        return;
    }

    IDocument tempDocument = document;

    if (document.SpansCount == 0)
    {
        // The document has no spans yet, so tokenize a temporary copy of its text.
        string text = document.Value;

        if (document.Length > MaxSampleLength)
        {
            int cut = MaxSampleLength;

            // Do not split a surrogate pair: if the last kept code unit is a high surrogate
            // and more text follows, drop it so the sample never ends in half a character.
            if (text.Length > cut && char.IsHighSurrogate(text[cut - 1]))
            {
                cut--;
            }

            text = text.Substring(0, cut);
        }

        tempDocument = new Document(text);
        Tokenizer.Process(tempDocument);

        // NOTE(review): Prepare() in this file samples up to 1000 characters and also runs
        // NumberNormalizer; this method does neither - confirm the difference is intentional.
    }

    var tag = Model.PredictMax(tempDocument, PredictionLimit);
    document.Language = Languages.CodeToEnum(tag.label);
}
/// <summary>
/// Returns a document that is ready to be consumed downstream: either
/// <paramref name="document"/> itself, or a temporary tokenized and number-normalized copy.
/// </summary>
/// <remarks>
/// When <paramref name="document"/> already has spans it is returned unchanged. Otherwise a new
/// <c>Document</c> is built from at most the first 1000 characters of its text, and
/// <c>Tokenizer</c> followed by <c>NumberNormalizer</c> are run on that copy.
/// </remarks>
/// <param name="document">The source document.</param>
/// <returns>The original document when it has spans; otherwise the processed temporary copy.</returns>
private IDocument Prepare(IDocument document)
{
    // Upper bound, in UTF-16 code units, on the text copied into the temporary document.
    const int MaxSampleLength = 1000;

    if (document.SpansCount != 0)
    {
        // Already has spans: use the document as it is.
        return document;
    }

    // The document has no spans yet, so tokenize a temporary copy of its text.
    string text = document.Value;

    if (document.Length > MaxSampleLength)
    {
        int cut = MaxSampleLength;

        // Do not split a surrogate pair: if the last kept code unit is a high surrogate
        // and more text follows, drop it so the sample never ends in half a character.
        if (text.Length > cut && char.IsHighSurrogate(text[cut - 1]))
        {
            cut--;
        }

        text = text.Substring(0, cut);
    }

    IDocument tempDocument = new Document(text);
    Tokenizer.Process(tempDocument);
    NumberNormalizer.Process(tempDocument);

    return tempDocument;
}