static bool GenerateFile(Options options) { AudioBitsPerSample bitRate = options.BitRate == 1 ? AudioBitsPerSample.Eight : AudioBitsPerSample.Sixteen; AudioChannel channel = options.Channel == 1 ? AudioChannel.Stereo : AudioChannel.Mono; var format = new SpeechAudioFormatInfo(options.SampleRate, bitRate, channel); string text = ""; if (String.IsNullOrEmpty(options.Text)) { text = File.ReadAllText(options.InputFile, Encoding.UTF8); } else { text = options.Text; } string path = String.IsNullOrEmpty(options.Folder) ? options.Path : String.Format("{0}/{1}", options.Folder, options.Path); SpeechSynthesizer speaker = new SpeechSynthesizer(); try { speaker.SelectVoice(options.VoiceName); speaker.SetOutputToWaveFile(path, format); speaker.Speak(text); } catch (Exception) { return(false); } return(true); }
public void Start(RecognitionConfig config) { _config = config.Windows; _stream = new PipeStream(9600); if (_speechEngine == null) { _speechEngine = new SpeechRecognitionEngine(); _speechEngine.LoadGrammar(new DictationGrammar()); _speechEngine.SpeechHypothesized += OnSpeechHypothesized; _speechEngine.SpeechRecognized += OnSpeechRecognized; } var format = new SpeechAudioFormatInfo(48000, AudioBitsPerSample.Sixteen, AudioChannel.Mono); _speechEngine.SetInputToAudioStream(_stream, format); if (!_recognizing) { _recognizing = true; _speaking = false; _speechEngine.RecognizeAsync(RecognizeMode.Multiple); } }
private void SetOutputStream(Stream stream, SpeechAudioFormatInfo formatInfo, bool headerInfo, bool closeStreamOnExit) { SetOutputToNull(); _outputStream = stream; _closeStreamOnExit = closeStreamOnExit; VoiceSynthesizer.SetOutput(stream, formatInfo, headerInfo); }
public SAPI() { int frequency = 16000; speechRate = 0; speechAudioFormatInfo = new SpeechAudioFormatInfo(frequency, AudioBitsPerSample.Sixteen, AudioChannel.Mono); }
public void SetOutputToWaveFile(string path, SpeechAudioFormatInfo formatInfo) { Helpers.ThrowIfEmptyOrNull(path, "path"); Helpers.ThrowIfNull(formatInfo, "formatInfo"); SetOutputToNull(); SetOutputStream(new FileStream(path, FileMode.Create, FileAccess.Write), formatInfo, true, true); }
/// <summary>
/// Convert SSML XML to raw audio bytes. The output has no WAV header, so you have to add it separately.
/// </summary>
static byte[] ConvertSsmlXmlToWav(string voiceId, string xml, WaveFormat format)
{
    using (var ms = new MemoryStream())
    using (var synthesizer = new SpeechSynthesizer())
    {
        if (format != null)
        {
            var blockAlignment = format.BitsPerSample / 8 * format.Channels;
            var averageBytesPerSecond = format.SampleRate * format.BitsPerSample / 8 * format.Channels;
            var formatInfo = new SpeechAudioFormatInfo(EncodingFormat.Pcm, format.SampleRate, format.BitsPerSample, format.Channels, averageBytesPerSecond, blockAlignment, new byte[0]);
            // SetOutputToAudioStream returns headerless audio data only.
            synthesizer.SetOutputToAudioStream(ms, formatInfo);
        }
        try
        {
            var voice = synthesizer.GetInstalledVoices().Cast<InstalledVoice>().FirstOrDefault(x => x.VoiceInfo.Id == voiceId);
            synthesizer.SelectVoice(voice.VoiceInfo.Name);
            synthesizer.SpeakSsml(xml);
            return ms.ToArray();
        }
        catch (Exception ex)
        {
            // Record the voice that was requested rather than a literal placeholder string.
            ex.Data.Add("Voice", voiceId);
            OnEvent(Exception, ex);
        }
    }
    return null;
}
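A minimal sketch of prepending the missing 44-byte RIFF/WAVE header to the raw PCM returned above; AddWavHeader is a hypothetical helper (it needs System.IO and System.Text) and assumes plain PCM with the same parameters that were passed to the synthesizer.

static byte[] AddWavHeader(byte[] pcm, int sampleRate, short bitsPerSample, short channels)
{
    short blockAlign = (short)(channels * bitsPerSample / 8);
    int avgBytesPerSec = sampleRate * blockAlign;
    using (var ms = new MemoryStream())
    using (var w = new BinaryWriter(ms))
    {
        w.Write(Encoding.ASCII.GetBytes("RIFF"));
        w.Write(36 + pcm.Length);                  // RIFF chunk size
        w.Write(Encoding.ASCII.GetBytes("WAVE"));
        w.Write(Encoding.ASCII.GetBytes("fmt "));
        w.Write(16);                               // fmt chunk size for plain PCM
        w.Write((short)1);                         // wFormatTag = PCM
        w.Write(channels);
        w.Write(sampleRate);
        w.Write(avgBytesPerSec);
        w.Write(blockAlign);
        w.Write(bitsPerSample);
        w.Write(Encoding.ASCII.GetBytes("data"));
        w.Write(pcm.Length);                       // data chunk size
        w.Write(pcm);
        w.Flush();
        return ms.ToArray();
    }
}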
public void SetOutputToAudioStream(Stream audioDestination, SpeechAudioFormatInfo formatInfo) { Helpers.ThrowIfNull(audioDestination, nameof(audioDestination)); Helpers.ThrowIfNull(formatInfo, nameof(formatInfo)); SetOutputStream(audioDestination, formatInfo, false, false); }
public void Initialise(Speech words) { if (WindowKinect.Device == null) { return; } WordsToWatch = words; var commands = words.GetCommands(new Choices()); commands.Add(new SemanticResultValue("sleep", SleepKey)); commands.Add(new SemanticResultValue("wake", WakeUpKey)); var speechAudioFormat = new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null); var grammerConstructor = new GrammarBuilder(); grammerConstructor.Culture = RecogniserInfo.Culture; grammerConstructor.Append(commands); SpeechEngine = new SpeechRecognitionEngine(RecogniserInfo.Id); SpeechEngine.LoadGrammar(new Grammar(grammerConstructor)); SpeechEngine.SpeechRecognized += SpeeckRecognized; SpeechEngine.UpdateRecognizerSetting("AdaptationOn", 0); SpeechEngine.SetInputToAudioStream(WindowKinect.Device.AudioSource.Start(), speechAudioFormat); SpeechEngine.RecognizeAsync(RecognizeMode.Multiple); Active = WindowKinect.Device.ElevationAngle > 0; }
public QueuedSpeechSynthesizer(ref MixingSampleProvider mspStandard, ref MixingSampleProvider mspLoopback, int volume, int rate) { synthesizer = new SpeechSynthesizer(); this.volume = volume; this.rate = rate; queue = new ObservableCollection <QSSQueueItem>(); this.mspStandard = mspStandard; this.mspLoopback = mspLoopback; int samplerate = int.Parse(File.ReadAllLines("audioformat.txt")[0].Replace("Sample rate: ", "")); int channels = int.Parse(File.ReadAllLines("audioformat.txt")[1].Replace("Channels: ", "")); if (channels == 1) { synthesizerAudioFormat = new SpeechAudioFormatInfo(samplerate, AudioBitsPerSample.Sixteen, AudioChannel.Mono); } else { synthesizerAudioFormat = new SpeechAudioFormatInfo(samplerate, AudioBitsPerSample.Sixteen, AudioChannel.Stereo); } waveFormat = new WaveFormat(samplerate, channels); startSpeakLoop(); }
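A hedged sketch of the two-line "audioformat.txt" layout the constructor above expects; the values are assumptions, only the "Sample rate: " and "Channels: " prefixes matter to the parsing code.

// Hypothetical example of the file this constructor reads:
File.WriteAllLines("audioformat.txt", new[]
{
    "Sample rate: 44100",
    "Channels: 1"
});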
void InitializeSpeechEngine(SpeechRecognitionEngine sre) { // Log function entrance TraceLog.TraceFunction(); try { // initialize and cache format info formatInfo = new SpeechAudioFormatInfo(defaultSampleRate, defaultBitsPerSample, defaultAudioChannels); // initialize and cache speech engine sre.UpdateRecognizerSetting("AssumeCFGFromTrustedSource", 1); string fileName = @"TELLME-SMS-LM.cfgp"; string appDataPath = HttpContext.Current.Server.MapPath("~/Content/grammars"); string grammarPath = Path.Combine(appDataPath, fileName); TraceLog.TraceInfo("Grammar path: " + grammarPath); // make sure the grammar files are copied over from the approot directory to the appDataPath InitializeGrammar(grammarPath, appDataPath, fileName); // initialize and load the grammar Grammar grammar = new Grammar(grammarPath); grammar.Enabled = true; sre.LoadGrammar(grammar); } catch (Exception ex) { TraceLog.TraceError("Speech Engine initialization failed: " + ex.Message); } }
public void SpeakMessage(AudioVideoFlow flow, string message) { try { SpeechSynthesizer synth = new SpeechSynthesizer(); SpeechAudioFormatInfo formatInfo = new SpeechAudioFormatInfo(16000, AudioBitsPerSample.Sixteen, Microsoft.Speech.AudioFormat.AudioChannel.Mono); SpeechSynthesisConnector connector = new SpeechSynthesisConnector(); synth.SetOutputToAudioStream(connector.Stream, formatInfo); connector.AttachFlow(flow); connector.Start(); synth.SpeakCompleted += new EventHandler<SpeakCompletedEventArgs>( (sender, args) => { connector.Stop(); synth.Dispose(); }); synth.SpeakAsync(message); } catch (Exception ex) { Console.WriteLine("Failed to play the message. {0}", ex); } }
private void StartSpeech(AssignedVoice vb, string outputfile) { WinAvailableVoice wv = (WinAvailableVoice)vb.root; // Find the best audio format to use for this voice. System.Collections.ObjectModel.ReadOnlyCollection <SpeechAudioFormatInfo> formats = wv.winVoice.VoiceInfo.SupportedAudioFormats; format = formats.FirstOrDefault(); if (format == null) { // The voice did not tell us its parameters, so we pick some. format = new SpeechAudioFormatInfo( 16000, // Samples per second AudioBitsPerSample.Sixteen, AudioChannel.Mono); } // First set up to synthesize the message into a WAV file. mstream = new FileStream(outputfile, FileMode.Create, FileAccess.Write); syn.SetOutputToWaveStream(mstream); pb = new PromptBuilder(); mainStyle = new PromptStyle(); // mainStyle.Volume = promptVol; syn.SelectVoice(wv.winVoice.VoiceInfo.Name); pb.StartStyle(mainStyle); }
/// <summary> /// Create an instance of AudioFileOut. /// </summary> internal AudioFileOut(Stream stream, SpeechAudioFormatInfo formatInfo, bool headerInfo, IAsyncDispatch asyncDispatch) { _asyncDispatch = asyncDispatch; _stream = stream; _startStreamPosition = _stream.Position; _hasHeader = headerInfo; _wfxOut = new WAVEFORMATEX(); // if we have a formatInfo object, format conversion may be necessary if (formatInfo != null) { // Build the Wave format from the formatInfo _wfxOut.wFormatTag = (short)formatInfo.EncodingFormat; _wfxOut.wBitsPerSample = (short)formatInfo.BitsPerSample; _wfxOut.nSamplesPerSec = formatInfo.SamplesPerSecond; _wfxOut.nChannels = (short)formatInfo.ChannelCount; } else { // Set the default values _wfxOut = WAVEFORMATEX.Default; } _wfxOut.nBlockAlign = (short)(_wfxOut.nChannels * _wfxOut.wBitsPerSample / 8); _wfxOut.nAvgBytesPerSec = _wfxOut.wBitsPerSample * _wfxOut.nSamplesPerSec * _wfxOut.nChannels / 8; }
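A minimal sketch of the same PCM bookkeeping under standard WAVEFORMATEX semantics; SpeechAudioFormatInfo already exposes the precomputed BlockAlign and AverageBytesPerSecond, so the arithmetic can be checked against it.

// For PCM: block align = bytes per sample frame; average bytes/sec = sample rate * block align.
var info = new SpeechAudioFormatInfo(16000, AudioBitsPerSample.Sixteen, AudioChannel.Mono);
int blockAlign = info.ChannelCount * info.BitsPerSample / 8;      // 1 * 16 / 8 = 2
int avgBytesPerSec = info.SamplesPerSecond * blockAlign;          // 16000 * 2 = 32000
System.Diagnostics.Debug.Assert(blockAlign == info.BlockAlign);
System.Diagnostics.Debug.Assert(avgBytesPerSec == info.AverageBytesPerSecond);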
public MeWantSpeech()
{
    // Convert text to an audio stream using .NET 3.x speech synthesis with G.711 u-law
    // (64 kbit/s; u-law companding maps 14-bit linear samples down to 8 bits).
    _speechSynthesizer = new SpeechSynthesizer();
    // Select a voice by hints (if one exists); SelectVoice can also pick a voice by name.
    _speechSynthesizer.SelectVoiceByHints(VoiceGender.Female, VoiceAge.Adult);
    _speechSynthesizer.Rate = 1;
    // Encoding format enums are Pcm, ALaw, ULaw.
    int samplesPerSecond = 8000;
    int bitsPerSample = 8;             // System.Speech.AudioFormat.AudioBitsPerSample.Eight
    int channelCount = 1;              // System.Speech.AudioFormat.AudioChannel.Mono
    int averageBytesPerSecond = 8000;  // 8-bit mono at 8 kHz: one byte per sample frame
    int blockAlign = 1;
    byte[] formatSpecificData = null;
    _formatInfo = new SpeechAudioFormatInfo(EncodingFormat.ULaw, samplesPerSecond, bitsPerSample, channelCount, averageBytesPerSecond, blockAlign, formatSpecificData);
}
private void CreateWAV(string fileName, string verbiage)
{
    using (Stream ret = new MemoryStream())
    using (SpeechSynthesizer synth = new SpeechSynthesizer())
    {
        // Call the non-public SetOutputStream(stream, format, headerInfo, closeStreamOnExit)
        // via reflection so the u-law output still gets a WAV header written.
        var mi = synth.GetType().GetMethod("SetOutputStream", BindingFlags.Instance | BindingFlags.NonPublic);
        // 8-bit mono u-law at 8 kHz: 8000 average bytes per second, block align of 1.
        var fmt = new SpeechAudioFormatInfo(EncodingFormat.ULaw, 8000, 8, 1, 8000, 1, null);
        mi.Invoke(synth, new object[] { ret, fmt, true, true });
        synth.SelectVoiceByHints(VoiceGender.Female, VoiceAge.Adult);
        synth.Speak(verbiage);
        // Copy the in-memory WAV out to the requested file.
        using (var fs = new FileStream(fileName, FileMode.Create, FileAccess.Write, FileShare.None))
        {
            ret.Position = 0;
            byte[] buffer = new byte[4096];
            for (; ;)
            {
                int len = ret.Read(buffer, 0, buffer.Length);
                if (len == 0) { break; }
                fs.Write(buffer, 0, len);
            }
        }
    }
}
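A hedged alternative to the reflection call above: SetOutputToWaveFile accepts the same SpeechAudioFormatInfo and writes the WAV header itself, with no private APIs. This is only a sketch, assuming writing straight to the target file (instead of the intermediate MemoryStream) is acceptable.

private void CreateWavDirect(string fileName, string verbiage)
{
    using (var synth = new SpeechSynthesizer())
    {
        var fmt = new SpeechAudioFormatInfo(EncodingFormat.ULaw, 8000, 8, 1, 8000, 1, null);
        synth.SetOutputToWaveFile(fileName, fmt);   // header is written for us
        synth.SelectVoiceByHints(VoiceGender.Female, VoiceAge.Adult);
        synth.Speak(verbiage);
    }
}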
public string RecognizeSpeech(byte[] speechToParse, int sampleRate) { SpeechRecognitionEngine sre = new SpeechRecognitionEngine(); if (_grammar == null) { InitializeGrammar(); } sre.LoadGrammar(_grammar); MemoryStream ms = new MemoryStream(speechToParse); var formatInfo = new SpeechAudioFormatInfo(sampleRate, AudioBitsPerSample.Sixteen, AudioChannel.Mono); sre.SetInputToAudioStream(ms, formatInfo); var result = sre.Recognize(); ms = null; if (result == null) { return("Unable to recognize speech"); } else { return(result.Text); } }
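A hypothetical usage sketch for RecognizeSpeech above, assuming "utterance.raw" contains headerless 16-bit mono PCM captured at 16 kHz.

byte[] pcm = File.ReadAllBytes("utterance.raw");   // headerless 16-bit mono PCM (assumption)
string transcript = RecognizeSpeech(pcm, 16000);
Console.WriteLine(transcript);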
protected void InitSpeechEngine(bool def) { try { WSRConfig cfg = WSRConfig.GetInstance(); WSRSpeechManager manager = WSRSpeechManager.GetInstance(); // File manager.InitEngines(); // Default if (def) { manager.AddDefaultEngine("Default", cfg.language, cfg.confidence); } // RTP if (rtpClient == null) { return; } var format = new SpeechAudioFormatInfo(16000, AudioBitsPerSample.Sixteen, AudioChannel.Stereo); manager.AddEngine("RTP", cfg.language, cfg.confidence, rtpClient.AudioStream, format); } catch (Exception ex) { WSRConfig.GetInstance().logError("ENGINE", "InitEngines: " + ex.Message); } }
void Flow_StateChanged(object sender, MediaFlowStateChangedEventArgs e) { Log("ControlAVCall Flow_StateChanged PreviousState=" + e.PreviousState + " State=" + e.State); AudioVideoFlow avFlow = (AudioVideoFlow)sender; if (avFlow.State == MediaFlowState.Active) { SpeechRecognitionConnector speechRecognitionConnector = new SpeechRecognitionConnector(); speechRecognitionConnector.AttachFlow(avFlow); SpeechRecognitionStream stream = speechRecognitionConnector.Start(); _speechRecognitionEngine = new SpeechRecognitionEngine(); _speechRecognitionEngine.SpeechRecognized += new EventHandler <SpeechRecognizedEventArgs>(_speechRecognitionEngine_SpeechRecognized); _speechRecognitionEngine.LoadGrammarCompleted += new EventHandler <LoadGrammarCompletedEventArgs>(_speechRecognitionEngine_LoadGrammarCompleted); Choices pathChoice = new Choices(new string[] { "previous", "next" }); Grammar gr = new Grammar(new GrammarBuilder(pathChoice)); _speechRecognitionEngine.LoadGrammarAsync(gr); SpeechAudioFormatInfo speechAudioFormatInfo = new SpeechAudioFormatInfo(8000, AudioBitsPerSample.Sixteen, Microsoft.Speech.AudioFormat.AudioChannel.Mono); _speechRecognitionEngine.SetInputToAudioStream(stream, speechAudioFormatInfo); _speechRecognitionEngine.RecognizeAsync(RecognizeMode.Multiple); } else { if (avFlow.SpeechRecognitionConnector != null) { avFlow.SpeechRecognitionConnector.DetachFlow(); } } }
public static void SpeakInWave(string text, string wavefile)
{
    try
    {
        if (!Initialized)
        {
            Initialize();
        }
        var fmt = new SpeechAudioFormatInfo(8000, AudioBitsPerSample.Eight, AudioChannel.Mono);
        SPS.SetOutputToWaveFile(wavefile, fmt);
        SPS.Speak(text);
        SPS.SetOutputToDefaultAudioDevice();
    }
    catch (Exception ex)
    {
    }
}
public void SetInputToAudioStream(Stream audioSource, SpeechAudioFormatInfo audioFormat) { Helpers.ThrowIfNull(audioSource, nameof(audioSource)); Helpers.ThrowIfNull(audioFormat, nameof(audioFormat)); RecoBase.SetInput(audioSource, audioFormat); }
/// <summary> /// Creates a new speech recognition engine. /// </summary> /// <returns>A new speech recognition engine object.</returns> private SpeechRecognitionEngine CreateSpeechRecognitionEngine() { // Create speech recognition engine var recognizer = SystemSpeech.CreateSpeechRecognitionEngine(this.Configuration.Language, this.Configuration.Grammars); // Attach event handlers for speech recognition events recognizer.AudioStateChanged += this.OnAudioStateChanged; recognizer.RecognizeCompleted += this.OnRecognizeCompleted; // Create the format info from the configuration input format SpeechAudioFormatInfo formatInfo = new SpeechAudioFormatInfo( (EncodingFormat)this.Configuration.InputFormat.FormatTag, (int)this.Configuration.InputFormat.SamplesPerSec, this.Configuration.InputFormat.BitsPerSample, this.Configuration.InputFormat.Channels, (int)this.Configuration.InputFormat.AvgBytesPerSec, this.Configuration.InputFormat.BlockAlign, (this.Configuration.InputFormat is WaveFormatEx) ? ((WaveFormatEx)this.Configuration.InputFormat).ExtraInfo : null); // Specify the input stream and audio format recognizer.SetInputToAudioStream(this.inputAudioStream, formatInfo); // Set the speech recognition engine parameters recognizer.InitialSilenceTimeout = TimeSpan.FromMilliseconds(this.Configuration.InitialSilenceTimeoutMs); recognizer.BabbleTimeout = TimeSpan.FromMilliseconds(this.Configuration.BabbleTimeoutMs); recognizer.EndSilenceTimeout = TimeSpan.FromMilliseconds(this.Configuration.EndSilenceTimeoutMs); recognizer.EndSilenceTimeoutAmbiguous = TimeSpan.FromMilliseconds(this.Configuration.EndSilenceTimeoutAmbiguousMs); return(recognizer); }
public void Run() { // Create AudioVideoFlow AudioVideoFlowHelper audioVideoFlowHelper = new AudioVideoFlowHelper(); _audioVideoFlow = audioVideoFlowHelper.CreateAudioVideoFlow( null, audioVideoFlow_StateChanged); // Create a speech synthesis connector and attach it to an AudioVideoFlow SpeechSynthesisConnector speechSynthesisConnector = new SpeechSynthesisConnector(); speechSynthesisConnector.AttachFlow(_audioVideoFlow); // Create a speech synthesis and set connector to it SpeechSynthesizer speechSynthesis = new SpeechSynthesizer(); SpeechAudioFormatInfo audioformat = new SpeechAudioFormatInfo(16000, AudioBitsPerSample.Sixteen, Microsoft.Speech.AudioFormat.AudioChannel.Mono); speechSynthesis.SetOutputToAudioStream(speechSynthesisConnector, audioformat); //Load readme file as the source Console.WriteLine(); Console.Write("Please enter the source file => "); string filename = Console.ReadLine(); string msg = ""; try { StreamReader objReader = new StreamReader(filename); msg = objReader.ReadToEnd(); } catch (FileNotFoundException) { Console.WriteLine("\r\nFile doesn't exist."); ShutdownPlatform(); } //Start connector speechSynthesisConnector.Start(); Console.WriteLine("\r\nStreaming source file for 15 seconds."); //Start streaming from speech synthesis. speechSynthesis.SpeakAsync(new Prompt(msg)); //Allow the connector to stream 15 seconds by waiting for 15 seconds Thread.Sleep(15000); //Stop the connector speechSynthesisConnector.Stop(); Console.WriteLine("\r\nSpeech synthesis connector stopped."); //speech synthesis connector must be detached from the flow, otherwise if the connector is rooted, it will keep the flow in memory. speechSynthesisConnector.DetachFlow(); // Shutdown the platform ShutdownPlatform(); _waitForShutdownEventCompleted.WaitOne(); }
/// <summary> /// Creates a new speech recognition engine /// </summary> /// <returns>A new speech recognition engine object.</returns> private SpeechRecognitionEngine CreateSpeechRecognitionEngine() { // Create the recognizer var recognizer = SystemSpeech.CreateSpeechRecognitionEngine(this.Configuration.Language, this.Configuration.Grammars); // Attach event handlers for speech recognition events recognizer.SpeechDetected += this.OnSpeechDetected; recognizer.SpeechHypothesized += this.OnSpeechHypothesized; recognizer.SpeechRecognized += this.OnSpeechRecognized; recognizer.SpeechRecognitionRejected += this.OnSpeechRecognitionRejected; recognizer.AudioSignalProblemOccurred += this.OnAudioSignalProblemOccurred; recognizer.AudioStateChanged += this.OnAudioStateChanged; recognizer.RecognizeCompleted += this.OnRecognizeCompleted; recognizer.RecognizerUpdateReached += this.OnRecognizerUpdateReached; recognizer.AudioLevelUpdated += this.OnAudioLevelUpdated; recognizer.EmulateRecognizeCompleted += this.OnEmulateRecognizeCompleted; recognizer.LoadGrammarCompleted += this.OnLoadGrammarCompleted; // Create the format info from the configuration input format SpeechAudioFormatInfo formatInfo = new SpeechAudioFormatInfo( (EncodingFormat)this.Configuration.InputFormat.FormatTag, (int)this.Configuration.InputFormat.SamplesPerSec, this.Configuration.InputFormat.BitsPerSample, this.Configuration.InputFormat.Channels, (int)this.Configuration.InputFormat.AvgBytesPerSec, this.Configuration.InputFormat.BlockAlign, (this.Configuration.InputFormat is WaveFormatEx) ? ((WaveFormatEx)this.Configuration.InputFormat).ExtraInfo : null); // Specify the input stream and audio format recognizer.SetInputToAudioStream(this.inputAudioStream, formatInfo); return(recognizer); }
public bool InitSTT(string recognizerID = null) { try { Console.Write("InitSTT"); Initialized = false; var RecognizerInfoLit = SpeechRecognitionEngine.InstalledRecognizers(); _recognition = new SpeechRecognitionEngine(new CultureInfo("en-US")); _recognition.LoadGrammar(new Grammar(new GrammarBuilder("exit"))); _recognition.LoadGrammar(new DictationGrammar()); loadAdditionalGrammer(_recognition); //_recognition.BabbleTimeout = new TimeSpan(0); // _recognition.InitialSilenceTimeout = new TimeSpan(0); _recognition.SpeechHypothesized += recognition_SpeechHypothesized; _recognition.SpeechRecognized += recognition_SpeechRecognized; _recognition.SpeechDetected += recognition_SpeechDetected; _recognition.RecognizeCompleted += recognition_RecognizeCompleted; _recognition.SpeechRecognitionRejected += (recognition_SpeechRecognizedRejected); _speechFormat = new SpeechAudioFormatInfo(_audioFormat.SampleRate, (AudioBitsPerSample)_audioFormat.BitRate, (AudioChannel)_audioFormat.Channels); //_recognition.UnloadAllGrammars(); Initialized = true; return(true); } catch (Exception ex) { Console.WriteLine(ex); } return(false); }
public void Run() { // A helper class to take care of platform and endpoint setup and cleanup. _helper = new UCMASampleHelper(); // Create a user endpoint using the network credential object. _userEndpoint = _helper.CreateEstablishedUserEndpoint("Broadcast User"); // Register a delegate to be called when an incoming audio-video call arrives. _userEndpoint.RegisterForIncomingCall <AudioVideoCall>(AudioVideoCall_Received); // Wait for the incoming call to be accepted. Console.WriteLine("Waiting for incoming call..."); _waitForCallToBeAccepted.WaitOne(); // Create a speech recognition connector and attach an AudioVideoFlow to it. SpeechRecognitionConnector speechRecognitionConnector = new SpeechRecognitionConnector(); speechRecognitionConnector.AttachFlow(_audioVideoFlow); // Start the speech recognition connector. SpeechRecognitionStream stream = speechRecognitionConnector.Start(); // Create a speech recognition engine. SpeechRecognitionEngine speechRecognitionEngine = new SpeechRecognitionEngine(); speechRecognitionEngine.SpeechRecognized += new EventHandler <SpeechRecognizedEventArgs>(SpeechRecognitionEngine_SpeechRecognized); //Add a grammar. string[] recoString = { "buy", "sell", "Fabrikam", "Contoso", "maximum", "minimum", "one", "ten", "twenty", "send" }; Choices choices = new Choices(recoString); speechRecognitionEngine.LoadGrammar(new Grammar(new GrammarBuilder(choices))); //Attach to audio stream to the SR engine. SpeechAudioFormatInfo speechAudioFormatInfo = new SpeechAudioFormatInfo(8000, AudioBitsPerSample.Sixteen, Microsoft.Speech.AudioFormat.AudioChannel.Mono); speechRecognitionEngine.SetInputToAudioStream(stream, speechAudioFormatInfo); Console.WriteLine("\r\nGrammar loaded, say send to send IM."); //Prepare the SR engine to perform multiple asynchronous recognitions. speechRecognitionEngine.RecognizeAsync(RecognizeMode.Multiple); //Pause the main thread until recognition completes. _waitForConnectorToStop.WaitOne(); speechRecognitionConnector.Stop(); Console.WriteLine("connector stopped"); // Detach the flow from the speech recognition connector, to prevent the flow from being kept in memory. speechRecognitionConnector.DetachFlow(); // Terminate the call, the conversation, and then unregister the // endpoint from receiving an incoming call. _audioVideoCall.BeginTerminate(CallTerminateCB, _audioVideoCall); _waitForConversationToBeTerminated.WaitOne(); // Shut down the platform. _helper.ShutdownPlatform(); }
internal RecognizedAudio(byte[] rawAudioData, SpeechAudioFormatInfo audioFormat, DateTime startTime, TimeSpan audioPosition, TimeSpan audioDuration) { _audioFormat = audioFormat; _startTime = startTime; _audioPosition = audioPosition; _audioDuration = audioDuration; _rawAudioData = rawAudioData; }
public void UseAudioQueue() { Debug.WriteLine("SR is using queued stream"); ebookStream = new EbookStream(ref conQueue); SpeechAudioFormatInfo info = new SpeechAudioFormatInfo(44100, AudioBitsPerSample.Sixteen, AudioChannel.Mono); recEngine.SetInputToAudioStream(ebookStream, info); }
public static string GetBase64Audio(string textInput) { var speechAudioFormatConfig = new SpeechAudioFormatInfo(samplesPerSecond: 8000, bitsPerSample: AudioBitsPerSample.Sixteen, channel: AudioChannel.Stereo); var waveFormat = new WaveFormat(speechAudioFormatConfig.SamplesPerSecond, speechAudioFormatConfig.BitsPerSample, speechAudioFormatConfig.ChannelCount); var prompt = new PromptBuilder { Culture = CultureInfo.CreateSpecificCulture("en-US") }; prompt.StartVoice(prompt.Culture); prompt.StartSentence(); prompt.StartStyle(new PromptStyle() { Emphasis = PromptEmphasis.Reduced, Rate = PromptRate.Slow }); prompt.AppendText(textInput); prompt.EndStyle(); prompt.EndSentence(); prompt.EndVoice(); var mp3Stream = new MemoryStream(); byte[] audioOutputBytes; string audioOutputAsString = null; using (var synthWaveMemoryStream = new MemoryStream()) { var resetEvent = new ManualResetEvent(false); ThreadPool.QueueUserWorkItem(arg => { try { var siteSpeechSynth = new SpeechSynthesizer(); siteSpeechSynth.SetOutputToAudioStream(synthWaveMemoryStream, speechAudioFormatConfig); siteSpeechSynth.Speak(prompt); } finally { resetEvent.Set(); } }); WaitHandle.WaitAll(new WaitHandle[] { resetEvent }); var bitRate = (speechAudioFormatConfig.AverageBytesPerSecond * 8); synthWaveMemoryStream.Position = 0; using (var mp3FileWriter = new LameMP3FileWriter(outStream: mp3Stream, format: waveFormat, bitRate: bitRate)) { synthWaveMemoryStream.CopyTo(mp3FileWriter); } audioOutputBytes = mp3Stream.ToArray(); audioOutputAsString = $"data:audio/mp3;base64,{Convert.ToBase64String(audioOutputBytes)}"; } return(audioOutputAsString); }
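A hypothetical usage sketch for the helper above: the returned data URI can be embedded directly in an HTML5 audio element.

string dataUri = GetBase64Audio("Hello from the speech synthesizer");
string html = "<audio controls src=\"" + dataUri + "\"></audio>";   // embed in a page or return from an API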
private void SetOutputStream(Stream stream, SpeechAudioFormatInfo formatInfo, bool headerInfo, bool closeStreamOnExit) { SetOutputToNull(); _outputStream = stream; _closeStreamOnExit = closeStreamOnExit; // Need to serialize into a proper wav file before closing the stream VoiceSynthesizer.SetOutput(stream, formatInfo, headerInfo); }
public void printDebugFormatInfo(SpeechAudioFormatInfo speechAudioFormatInfoToPrint) { System.Diagnostics.Debug.WriteLine("Samples per second: " + speechAudioFormatInfoToPrint.SamplesPerSecond); System.Diagnostics.Debug.WriteLine("Average bytes per second: " + speechAudioFormatInfoToPrint.AverageBytesPerSecond); System.Diagnostics.Debug.WriteLine("Bits per sample: " + speechAudioFormatInfoToPrint.BitsPerSample); System.Diagnostics.Debug.WriteLine("Channel count: " + speechAudioFormatInfoToPrint.ChannelCount); System.Diagnostics.Debug.WriteLine("Encoding format: " + speechAudioFormatInfoToPrint.EncodingFormat); System.Diagnostics.Debug.WriteLine("Block Align: " + speechAudioFormatInfoToPrint.BlockAlign); }
public WSRSpeechEngine AddEngine(String prefix, String language, double confidence, Stream source, SpeechAudioFormatInfo format) { WSRSpeechEngine engine = new WSRSpeechEngine(prefix, language, confidence); engine.LoadGrammar(); engine.Init(); engine.GetEngine().SetInputToAudioStream(source, format); engine.Start(); Engines.Add(prefix, engine); return engine; }
public WindowsSpeech() { speechRecognizer = new SpeechRecognitionEngine(); Grammar grammar = new Grammar(new GrammarBuilder("one two")); speechRecognizer.LoadGrammar(grammar); formatInfo = new SpeechAudioFormatInfo(16000, AudioBitsPerSample.Sixteen, AudioChannel.Mono); speechRecognizer.SpeechRecognized += new EventHandler <SpeechRecognizedEventArgs>(SpeechRecognizedHandler); }
public double computeLogEnergy(short[] audioArray, SpeechAudioFormatInfo speechAudioFormatInfo)
{
    // Compute the average power (mean of the squared samples), then take its natural log.
    double energy = 0.0;
    for (int i = 0; i < audioArray.Length; ++i)
        energy += (double)audioArray[i] * (double)audioArray[i];
    energy /= audioArray.Length;
    energy = Math.Log(energy);
    // System.Diagnostics.Debug.WriteLine("energy: " + energy);
    return energy;
}
/// <summary>
/// Sound generator constructor.
/// </summary>
/// <param name="aVoice">The voice selected from the system.</param>
/// <param name="aRate">Playback rate.</param>
/// <param name="aSamplePerSecond">Samples per second.</param>
public VoiceSynthesizer(string aVoice, int aRate = 0, int aSamplePerSecond = 22050)
{
    if (string.IsNullOrEmpty(aVoice))
    {
        throw new ArgumentNullException("aVoice");
    }
    if (aRate < -10 || aRate > 10)
    {
        throw new ArgumentException("Rate must be in the interval [-10; 10]");
    }
    generator = new SpeechSynthesizer();
    generator.SelectVoice(aVoice);
    generator.Rate = aRate;
    audioFormat = new SpeechAudioFormatInfo(aSamplePerSecond, AudioBitsPerSample.Sixteen, AudioChannel.Stereo);
}
public void AddSpeechEngine(Stream stream, string format, String device, String language, double confidence) { language = (language == null) ? ConfigManager.GetInstance().Find("bot.language", "fr-FR") : language; var info = new SpeechAudioFormatInfo(16000, AudioBitsPerSample.Sixteen, AudioChannel.Stereo); if ("Kinect".Equals(format)) { info = new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null); } SpeechEngine engine = new SpeechEngine(device, language, confidence); engine.Load(GrammarManager.GetInstance().Cache, false); engine.Init(); engine.Engine.SetInputToAudioStream(stream, info); engine.Start(); Engines.Add(device, engine); }
static void Main(string[] args) { var text = args.Length > 0 ? args[0] : "enter some text as first argument to this command"; // Initialize a new instance of the SpeechSynthesizer. SpeechSynthesizer synth = new SpeechSynthesizer(); // Configure the audio output. if (args.Length > 1) { var audioformat = new SpeechAudioFormatInfo(16000,AudioBitsPerSample.Sixteen, AudioChannel.Mono); synth.SetOutputToWaveFile(args[1], audioformat); } else { synth.SetOutputToDefaultAudioDevice(); } // Speak a string. synth.Speak(text); }
private void button1_Click(object sender, EventArgs e) { if (string.IsNullOrEmpty(textBox1.Text)) { MessageBox.Show(this, "Write some text to start.", "Empty text", MessageBoxButtons.OK, MessageBoxIcon.Asterisk); return; } string[] texts = textBox1.Text.Split('\n'); progressBar1.Value = 0; progressBar1.Maximum = texts.Length; for (int i = 0; i < texts.Length; i++) { var reader = new SpeechSynthesizer(); //reader.SpeakCompleted += new EventHandler<SpeakCompletedEventArgs>(synth_SpeakCompleted); reader.Rate = trackRate.Value; reader.Volume = trackVolume.Value; reader.SelectVoice(((VoiceInfo)cmbVoice.SelectedItem).Name); var bits = radio8Bits.Checked ? AudioBitsPerSample.Eight : AudioBitsPerSample.Sixteen; var channel = radioChannelMono.Checked ? AudioChannel.Mono : AudioChannel.Stereo; var format = new SpeechAudioFormatInfo(int.Parse(cmbSamples.Text), bits, channel); string filePath = Directory.GetCurrentDirectory() + @"\Output\"; if (!Directory.Exists(filePath)) { Directory.CreateDirectory(filePath); } reader.SetOutputToWaveFile(Directory.GetCurrentDirectory() + @"\Output\" + GetAudioFileName(texts[i]), format); reader.Speak(GetAudioText(texts[i])); progressBar1.Value++; reader.Dispose(); } MessageBox.Show(this, "All done. Check .wav files on 'Output' folder.", "Finish", MessageBoxButtons.OK, MessageBoxIcon.Information); }
public void Generate(List<string> p_aryNames, List<string> p_aryLines, string p_strPath, int p_nRate, AudioBitsPerSample p_samples, AudioChannel p_channels)
{
    SpeechAudioFormatInfo t_audioFormatInfo = new SpeechAudioFormatInfo(p_nRate, p_samples, p_channels);
    SpeechSynthesizer t_synth = new SpeechSynthesizer();
    progressBar1.Maximum = p_aryLines.Count;
    progressBar1.Step = 1;
    label1.Text = progressBar1.Step + "/" + p_aryNames.Count;
    for (int t_i = 0; t_i < p_aryNames.Count; ++t_i)
    {
        // Pass the format that was built above so the rate, bit depth and channel arguments are actually used.
        t_synth.SetOutputToWaveFile(p_strPath + "\\" + p_aryNames[t_i] + ".wav", t_audioFormatInfo);
        t_synth.Speak(p_aryLines[t_i]);
        label1.Text = (t_i + 1) + "/" + p_aryLines.Count;
        progressBar1.PerformStep();
        progressBar1.Refresh();
    }
    t_synth.Dispose();
    Close();
}
public void SetInputToAudioStream( Stream audioSource, SpeechAudioFormatInfo audioFormat ) { engine.SetInputToAudioStream( audioSource, audioFormat ); }
void StartAudioStream()
{
    sensor.Start();
    _kinectSource = sensor.AudioSource;
    _kinectSource.AutomaticGainControlEnabled = true;
    _kinectSource.EchoCancellationMode = EchoCancellationMode.None;
    _kinectSource.BeamAngleMode = BeamAngleMode.Adaptive;
    _stream = _kinectSource.Start();
    // The Kinect audio source delivers 16 kHz, 16-bit mono PCM.
    SpeechAudioFormatInfo format = new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null);
    // Reuse the stream that was just started instead of calling Start() a second time.
    _speechEngine.SetInputToAudioStream(_stream, format);
    _speechEngine.RecognizeAsync(RecognizeMode.Multiple);
}
void InitializeRecognitionEngine(string cultureName = "en-US") { RecognizerInfo ri = TryGetKinectRecognizer(cultureName); if(ri==null) { throw new Exception("No Recognizer"); } recognitionEngine = new SpeechRecognitionEngine(ri.Id); recognitionEngine.SpeechRecognized += recognitionEngine_SpeechRecognized; SpeechAudioFormatInfo speechAudioFormatInfo = new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null); recognitionEngine.SetInputToAudioStream(convertStream, speechAudioFormatInfo); }
// Analyzes the audio currently in the buffer and estimates the fundamental frequency. public int extractPitch(short[] audioArray, SpeechAudioFormatInfo speechAudioFormatInfo) { System.Diagnostics.Debug.WriteLine("SpeechEmotionRecognitionEngine::extractPitch()"); if (audioArray == null) { System.Diagnostics.Debug.WriteLine("audioArray is null"); return -1; } // For Debugging. // printDebugFormatInfo(speechAudioFormatInfo); // Only allow 8 or 16 bit audio. if (speechAudioFormatInfo.BitsPerSample != 8 && speechAudioFormatInfo.BitsPerSample != 16) { System.Diagnostics.Debug.WriteLine("Invalid BitsPerSample"); return -1; } // To detect the pitch, we take a window of the signal, with a length at least twice as long // as the longest period that we might detect. If the sampling rate is 44,100 Hz, this // corresponded to a length of 1200 samples. For effecient calculation, I use this ratio to approximate // the windowSize. This give a ratio of 36.75, which I rounded down to 36. int windowSize = 2048; // speechAudioFormatInfo.SamplesPerSecond / 36; int numWindows = audioArray.Length / (windowSize * speechAudioFormatInfo.ChannelCount); double[][] correlationFunctions = new double[numWindows][]; int[] estimatedFundamentalFrequencies = new int[numWindows]; for (int windowIndex = 0; windowIndex < numWindows; ++windowIndex) { // Store the current window in inputAudio so we can work with it. short[] inputAudio = new short[windowSize]; for (int i = 0; i < windowSize; ++i) inputAudio[i] = audioArray[i + windowIndex * windowSize]; // Calculate the correlation function. correlationFunctions[windowIndex] = correlation(inputAudio, speechAudioFormatInfo); // Clip all results below 0 to 0. for (int i = 0; i < windowSize / 2; ++i) { if (correlationFunctions[windowIndex][i] < 0) correlationFunctions[windowIndex][i] = 0; } // Stretch correlation results by a factor of 2 and subtract from the original signal. for (int i = 0; i < windowSize / 2; ++i) { int value; if (i % 2 == 0) value = inputAudio[i / 2]; else value = (inputAudio[i / 2 + 1] - inputAudio[i / 2]) / 2; correlationFunctions[windowIndex][i] -= value; } // Clip all results below 0 to 0. for (int i = 0; i < windowSize / 2; ++i) { if (correlationFunctions[windowIndex][i] < 0) correlationFunctions[windowIndex][i] = 0; } // Finally, estimate fundamental frequency. estimatedFundamentalFrequencies[windowIndex] = estimateF0(correlationFunctions[windowIndex], speechAudioFormatInfo); } // Calculate the average frequency over all the windows. // Can this overflow? Nah, probably not. long tmp = estimatedFundamentalFrequencies[0]; for (int windowIndex = 1; windowIndex < numWindows; ++windowIndex) tmp += estimatedFundamentalFrequencies[windowIndex]; int averageEstimatedFrequency = (int)(tmp / numWindows); // Could also return useful information like standard deviation, pitch acceleration, rising/falling, etc. return averageEstimatedFrequency; }
/// <summary> /// Sets where synthesized speech is rendered to. This sets the output to wave file. /// </summary> /// <param name="path">The path where to save the wave to.</param> /// <param name="formatInfo">The format info for rendering the file.</param> public virtual void SetOutputToWaveFile(string path, SpeechAudioFormatInfo formatInfo) { throw new NotImplementedException(); }
public void addCommentToLocation(string streamOfComment, string latitude , string longitude) { SpeechAudioFormatInfo audioType = new SpeechAudioFormatInfo(1000,AudioBitsPerSample.Sixteen,AudioChannel.Mono); SpeechSynthesizer speech = new SpeechSynthesizer("SmartAudioCityGuide", "Lz+vYpOFm6NTP83A9y0tPoX6ByJa06Q6yxHvoBsD0xo="); byte[] streamString; Locations location = new Locations(); byte[] buffer = new byte[10]; MemoryStream stream = new MemoryStream(); using (SpeechRecognitionEngine speechRecongnizeEngine = new SpeechRecognitionEngine()) { location.latitude = Convert.ToDouble(latitude); location.longitude = Convert.ToDouble(longitude); locationsServices.addLocations(location); streamString = serializer.Deserialize<byte[]>(streamOfComment); buffer = new byte[streamString.Count()]; stream.Write(buffer, 0, buffer.Length); // Add a handler for the LoadGrammarCompleted event. speechRecongnizeEngine.LoadGrammarCompleted += new EventHandler<LoadGrammarCompletedEventArgs>(speechRecongnizeEngine_LoadGrammarCompleted); // Add a handler for the SpeechRecognized event. speechRecongnizeEngine.SpeechRecognized += new EventHandler<SpeechRecognizedEventArgs>(speechRecongnizeEngine_SpeechRecognized); speechRecongnizeEngine.LoadGrammar(new DictationGrammar()); speechRecongnizeEngine.SetInputToAudioStream(stream, audioType); speechRecongnizeEngine.RecognizeAsync(RecognizeMode.Multiple); } using (SpeechRecognizer recognizer = new SpeechRecognizer()) { // Create SemanticResultValue objects that contain cities and airport codes. SemanticResultValue chicago = new SemanticResultValue("Chicago", "ORD"); SemanticResultValue boston = new SemanticResultValue("Boston", "BOS"); SemanticResultValue miami = new SemanticResultValue("Miami", "MIA"); SemanticResultValue dallas = new SemanticResultValue("Dallas", "DFW"); // Create a Choices object and add the SemanticResultValue objects, using // implicit conversion from SemanticResultValue to GrammarBuilder Choices cities = new Choices(); cities.Add(new Choices(new GrammarBuilder[] { chicago, boston, miami, dallas })); // Build the phrase and add SemanticResultKeys. GrammarBuilder chooseCities = new GrammarBuilder(); chooseCities.Append("I want to fly from"); chooseCities.Append(new SemanticResultKey("origin", cities)); chooseCities.Append("to"); chooseCities.Append(new SemanticResultKey("destination", cities)); // Build a Grammar object from the GrammarBuilder. Grammar bookFlight = new Grammar(chooseCities); bookFlight.Name = "Book Flight"; // Add a handler for the LoadGrammarCompleted event. recognizer.LoadGrammarCompleted += new EventHandler<LoadGrammarCompletedEventArgs>(recognizer_LoadGrammarCompleted); // Add a handler for the SpeechRecognized event. recognizer.SpeechRecognized += new EventHandler<SpeechRecognizedEventArgs>(recognizer_SpeechRecognized); // Attach event handlers for recognition events. recognizer.SpeechRecognized += new EventHandler<SpeechRecognizedEventArgs>( SpeechRecognizedHandler); recognizer.EmulateRecognizeCompleted += new EventHandler<EmulateRecognizeCompletedEventArgs>( EmulateRecognizeCompletedHandler); // Load the grammar object to the recognizer. recognizer.LoadGrammarAsync(bookFlight); } }
public void computeSpectrum(short[] audioArray, float[] freqOut, SpeechAudioFormatInfo speechAudioFormatInfo) { System.Diagnostics.Debug.WriteLine("SpeechEmotionRecognitionEngine::computeSpectrum()"); if (audioArray == null || freqOut == null) { System.Diagnostics.Debug.WriteLine("audioArray or freqOut is null"); return; } // For Debugging. // printDebugFormatInfo(speechAudioFormatInfo); // Only allow 8 or 16 bit audio. if (speechAudioFormatInfo.BitsPerSample != 8 && speechAudioFormatInfo.BitsPerSample != 16) { System.Diagnostics.Debug.WriteLine("Invalid BitsPerSample"); return; } int windowSize = audioArray.Length; int numWindows = audioArray.Length / (windowSize * speechAudioFormatInfo.ChannelCount); int height = windowSize / 2; int half = windowSize / 2; int maxSamples = half; float[] processed = new float[windowSize]; for (int i = 0; i < windowSize; ++i) processed[i] = 0.0f; float[] fftOut = new float[windowSize]; float[] inputAudio = new float[windowSize]; int[] estimatedFundamentalFrequencies = new int[numWindows]; for (int i = 0; i < windowSize; ++i) inputAudio[i] = (float)audioArray[i]; windowFunction(WindowFunction.HANNING, windowSize, inputAudio); // Take FFT. fft(inputAudio, null, fftOut, null, 1, speechAudioFormatInfo); // Compute power. for (int i = 0; i < windowSize; ++i) inputAudio[i] = (float)(fftOut[i] * fftOut[i]); // Tolonen and Karjalainen recommend taking the cube root // of the power, instead of the square root for (int i = 0; i < windowSize; i++) inputAudio[i] = (float)(Math.Pow(inputAudio[i], 1.0f / 3.0f)); // Take FFT. fft(inputAudio, null, fftOut, null, 1, speechAudioFormatInfo); for (int i = 0; i < half; i++) processed[i] += fftOut[i]; // Peak Pruning as described by Tolonen and Karjalainen, 2000 // Clip at zero, copy to temp array for (int i = 0; i < maxSamples; ++i) { if (processed[i] < 0.0) processed[i] = (float)0.0; fftOut[i] = processed[i]; } // Subtract a time-doubled signal (linearly interp.) from the original // (clipped) signal for (int i = 0; i < maxSamples; ++i) { if ((i % 2) == 0) processed[i] -= fftOut[i / 2]; else processed[i] -= ((fftOut[i / 2] + fftOut[i / 2 + 1]) / 2); } // Clip at zero again for (int i = 0; i < maxSamples; ++i) { if (processed[i] < 0.0) processed[i] = (float)0.0; } // Find new max float max = 0; for (int i = 1; i < maxSamples; i++) if (processed[i] > max) max = processed[i]; // Reverse and scale for (int i = 0; i < maxSamples; ++i) inputAudio[i] = processed[i] / (windowSize / 4); for (int i = 0; i < maxSamples; ++i) processed[maxSamples - 1 - i] = inputAudio[i]; // Finally, put it into bins in grayscaleOut[], normalized to a 0.0-1.0 scale for (int i = 0; i < height; ++i) { float bin0 = (float)(i) * maxSamples / height; float bin1 = (float)(i + 1) * maxSamples / height; float binwidth = bin1 - bin0; float value = 0.0f; if ((int)bin1 == (int)bin0) value = processed[(int)bin0]; else { value += processed[(int)bin0] * ((int)bin0 + 1 - bin0); bin0 = 1 + (int)bin0; while (bin0 < (int)bin1) { value += processed[(int)bin0]; bin0 += 1.0f; } value += processed[(int)bin1] * (bin1 - (int)bin1); value /= binwidth; } // Should we be clipping at max 1.0? // I trial-and-errored for a while, and I don't think the clipping is necessary. // if (value > 1.0) // value = 1.0f; if (value < 0.0) value = 0.0f; freqOut[i] = value; } }
public double getMaximumFrequencyValue(float[] fftRealOutput, float[] fftComplexOutput, SpeechAudioFormatInfo speechAudioFormatInfo) { if (fftRealOutput == null || fftComplexOutput == null) return -1; // Error. int numSamples = fftRealOutput.Length; if (fftComplexOutput.Length != numSamples) return -1; // Error. // Calculate fundamental frequency. int fundamentalFrequencySamples = 0; double maxValue = Math.Pow(fftRealOutput[fundamentalFrequencySamples], 2) + Math.Pow(fftComplexOutput[fundamentalFrequencySamples], 2); for (int i = 1; i < numSamples; ++i) { if (Math.Pow(fftRealOutput[i], 2) + Math.Pow(fftComplexOutput[i], 2) > maxValue) { fundamentalFrequencySamples = i; maxValue = Math.Pow(fftRealOutput[fundamentalFrequencySamples], 2) + Math.Pow(fftComplexOutput[fundamentalFrequencySamples], 2); } } // System.Diagnostics.Debug.WriteLine("maxFrequencyValue: " + maxValue); return maxValue; }
public double getFundamentalFrequency(float[] fftRealOutput, float[] fftComplexOutput, SpeechAudioFormatInfo speechAudioFormatInfo) { if (fftRealOutput == null || fftComplexOutput == null || speechAudioFormatInfo == null) return -1; // Error. int numSamples = fftRealOutput.Length; if (fftComplexOutput.Length != numSamples) return -1; // Error. // Calculate fundamental frequency. int fundamentalFrequencySamples = 0; double maxValue = Math.Pow(fftRealOutput[fundamentalFrequencySamples], 2) + Math.Pow(fftComplexOutput[fundamentalFrequencySamples], 2); for (int i = 1; i < numSamples; ++i) { if (Math.Pow(fftRealOutput[i], 2) + Math.Pow(fftComplexOutput[i], 2) > maxValue) { fundamentalFrequencySamples = i; maxValue = Math.Pow(fftRealOutput[fundamentalFrequencySamples], 2) + Math.Pow(fftComplexOutput[fundamentalFrequencySamples], 2); } } double fundamentalFrequency = fundamentalFrequencySamples * (speechAudioFormatInfo.SamplesPerSecond / ((double)numSamples / speechAudioFormatInfo.ChannelCount)); // System.Diagnostics.Debug.WriteLine("fundamentalFrequency: " + fundamentalFrequency); return fundamentalFrequency; }
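A minimal worked sketch of the bin-to-hertz conversion used above, assuming a 2048-sample mono FFT over 16 kHz audio (all values hypothetical).

int numSamples = 2048;          // FFT length (assumption)
int samplesPerSecond = 16000;   // sample rate of the mono input (assumption)
int channelCount = 1;
int peakBin = 32;               // index of the strongest bin
double hzPerBin = samplesPerSecond / ((double)numSamples / channelCount);  // 16000 / 2048 = 7.8125 Hz
double fundamental = peakBin * hzPerBin;                                   // 32 * 7.8125 = 250 Hz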
public double[] correlation(short[] inputAudio, SpeechAudioFormatInfo speechAudioFormatInfo) { int size = inputAudio.Length / 2; // Initialize the correlation function to 0. double[] correlationFunction = new double[size]; for (int i = 0; i < size; ++i) correlationFunction[i] = 0; for (int shift = 0; shift < size; shift += speechAudioFormatInfo.ChannelCount) { for (int audioIndex = 0; audioIndex < size; audioIndex += speechAudioFormatInfo.ChannelCount) { // Can overflow happen here when setting an int to the result of multiplying 2 bytes? No, no it can't. Yeah that's right... /* double difference = (double)(inputAudio[audioIndex] - inputAudio[audioIndex + shift * speechAudioFormatInfo.ChannelCount]); correlationFunction[i] += (difference * difference); */ correlationFunction[shift] += (double)inputAudio[audioIndex] * (double)inputAudio[audioIndex + shift]; } correlationFunction[shift] /= size; } /* // For debugging. // Print the first window's correlation function, just to see what it looks like. String correlationFunctionString = ""; for (int correlationIndex = 0; correlationIndex < size; ++correlationIndex) correlationFunctionString += correlationFunction[correlationIndex] + "."; System.Diagnostics.Debug.WriteLine("Correlation function: " + correlationFunctionString); */ return correlationFunction; }
// Based off of Audacity source and numerical recipes. // Also, check out this useful website: // http://www.codeproject.com/KB/recipes/howtofft.aspx public void fft(float[] realIn, float[] imagIn, float[] realOut, float[] imagOut, int sign, SpeechAudioFormatInfo speechAudioFormatInfo) { int n, mmax, m, j, istep, i; double wtemp, wr, wpr, wpi, wi, theta, tempr, tempi; int numSamples = realIn.Length; int numBits = numBitsNeeded(numSamples); int numComplexSamples = numSamples * 2; if (!isPowerOfTwo(numSamples)) { System.Diagnostics.Debug.WriteLine(numSamples + " is not a power of two"); return; } if (imagOut == null) imagOut = new float[numSamples]; if (sign > 0) sign = 1; else sign = -1; if (fastBitReversalTable == null) initFFT(); // Do simultaneous data copy and bit-reversal ordering into interleaved intermediate output... float[] data = new float[numComplexSamples]; for (i = 0; i < numSamples; i++) { j = reverseBits(i, numBits); data[2 * j] = (float)realIn[i]; data[2 * j + 1] = (imagIn == null) ? 0.0f : imagIn[i]; } // Do the FFT itself... // Danielson-Lanzcos routine mmax = 2; n = numComplexSamples; while (n > mmax) { istep = mmax << 1; theta = sign * (2 * Math.PI / mmax); wtemp = Math.Sin(0.5 * theta); wpr = -2.0 * wtemp * wtemp; wpi = Math.Sin(theta); wr = 1.0; wi = 0.0; for (m = 1; m < mmax; m += 2) { for (i = m; i <= n; i += istep) { j = i + mmax; tempr = wr * data[j - 1] - wi * data[j]; tempi = wr * data[j] + wi * data[j - 1]; data[j - 1] = data[i - 1] - (float)tempr; data[j] = data[i] - (float)tempi; data[i - 1] += (float)tempr; data[i] += (float)tempi; } wtemp = wr; wr = wtemp * wpr - wi * wpi + wr; wi = wi * wpr + wtemp * wpi + wi; } mmax = istep; } // De-interleave the real/complex data into the outputs. for (i = 0; i < numSamples; ++i) { realOut[i] = data[2 * i]; imagOut[i] = data[2 * i + 1]; } // Happy =) }
private void saveButton_Click(object sender, EventArgs e)
{
    string fileName = textBox1.Text.Trim();
    if (fileName.Length == 0)
    {
        return;
    }
    // Strip any existing extension so the length check only sees the base name.
    if (fileName.EndsWith(".wav"))
    {
        fileName = fileName.Substring(0, fileName.Length - 4);
    }
    if (fileName.Length > 8)
    {
        string oldName = fileName;
        fileName = fileName.Substring(0, 8);
        string warning = "The filename: " + oldName + " is too long.\nMax length for Taranis is 12 characters including file ending.\nFile will be renamed to " + fileName + ".wav";
        MessageBox.Show(this, warning);
    }
    // Re-append the extension for every file, not only the truncated ones.
    fileName += ".wav";
    textBox1.Text = fileName;
    if (folderName == null)
    {
        folderName = Environment.CurrentDirectory;
    }
    string text = textBox2.Text;
    speaker.SelectVoice(voices[comboBox1.SelectedIndex]);
    // 8-bit mono u-law at 32 kHz: average bytes/sec equals the sample rate, block align is 1.
    var speechAudioFormatInfo = new SpeechAudioFormatInfo(EncodingFormat.ULaw, 32000, 8, 1, 32000, 1, null);
    speaker.SetOutputToWaveFile(folderName + "\\" + fileName, speechAudioFormatInfo);
    speaker.Speak(text);
}
/// <summary> /// Sets where synthesized speech is rendered to. This sets the output to audio stream. /// </summary> /// <param name="audioDestination">The audio destination.</param> /// <param name="formatInfo">The format info for rendering the stream.</param> public virtual void SetOutputToAudioStream(Stream audioDestination, SpeechAudioFormatInfo formatInfo) { throw new NotImplementedException(); }
// ========================================== // HANDLE SPEECH RECOGNITION // ========================================== public override void InitSpeechEngine() { base.InitSpeechEngine(false); try { WSRConfig cfg = WSRConfig.GetInstance(); WSRSpeechManager manager = WSRSpeechManager.GetInstance(); SpeechAudioFormatInfo format = new SpeechAudioFormatInfo(EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null); for( int i = 0 ; i < Sensors.Count ; i++) { KinectAudioSource source = Sensors[i].Sensor.AudioSource; source.EchoCancellationMode = EchoCancellationMode.CancellationAndSuppression; source.NoiseSuppression = true; source.BeamAngleMode = BeamAngleMode.Adaptive; //set the beam to adapt to the surrounding source.AutomaticGainControlEnabled = false; if (WSRConfig.GetInstance().Echo >= 0){ source.EchoCancellationSpeakerIndex = WSRConfig.GetInstance().Echo; } String prefix = "KINECT_" + i; cfg.logInfo(prefix, "AutomaticGainControlEnabled : " + source.AutomaticGainControlEnabled); cfg.logInfo(prefix, "BeamAngle : " + source.BeamAngle); cfg.logInfo(prefix, "EchoCancellationMode : " + source.EchoCancellationMode); cfg.logInfo(prefix, "EchoCancellationSpeakerIndex : " + source.EchoCancellationSpeakerIndex); cfg.logInfo(prefix, "NoiseSuppression : " + source.NoiseSuppression); cfg.logInfo(prefix, "SoundSourceAngle : " + source.SoundSourceAngle); cfg.logInfo(prefix, "SoundSourceAngleConfidence : " + source.SoundSourceAngleConfidence); var stream = source.Start(); // streamer = new SpeechStreamer(stream); // FIXME manager.AddEngine(prefix, cfg.language, cfg.confidence, stream, format); } } catch (Exception ex) { WSRConfig.GetInstance().logError("ENGINE", "Init Kinect Engines: " + ex.Message); } }
protected override void OnLoad(EventArgs e) { Visible = false; ShowInTaskbar = false; base.OnLoad(e); /* * Get all installed voices * */ var voices = speech.GetInstalledVoices(); string voice = ""; foreach (InstalledVoice v in voices) { if (v.Enabled) //voice = v.VoiceInfo.Name; Console.WriteLine(v.VoiceInfo.Name); } queuetimer = new System.Timers.Timer(250); queuetimer.Elapsed += (object sender, ElapsedEventArgs ev) => { TTSRequest r; if (Queue.TryDequeue(out r)) { Console.WriteLine("dequeing off of concurrent queue..."); if (r.Interrupt) { // stop current TTS if (IsSpeaking) { //speech.StopSpeaking(); } if (IsSounding) { //sound.Stop(); if(sound.PlaybackState == PlaybackState.Playing) { sound.Stop(); } } // clear queue SpeechQueue.Clear(); } if(!r.Reset) { SpeechQueue.Enqueue(r); } RequestCount++; } var eventdata = new Hashtable(); eventdata.Add("ProcessedRequests", RequestCount); eventdata.Add("QueuedRequests", SpeechQueue.Count); eventdata.Add("IsSpeaking", IsSounding); InstrumentationEvent blam = new InstrumentationEvent(); blam.EventName = "status"; blam.Data = eventdata; NotifyGui(blam.EventMessage()); }; // when this timer fires, it will pull off of the speech queue and speak it // the long delay also adds a little pause between tts requests. speechtimer = new System.Timers.Timer(250); speechtimer.Elapsed += (object sender, ElapsedEventArgs ev) => { if (IsSpeaking.Equals(false)) { if (SpeechQueue.Count > 0) { TTSRequest r = SpeechQueue.Dequeue(); Console.WriteLine("dequeuing off of speech queue"); IsSpeaking = true; speechtimer.Enabled = false; //speech.SpeakAsync(r.Text); //using (speech = new SpeechSynthesizer()) { speech = new SpeechSynthesizer(); speech.SpeakCompleted += speech_SpeakCompleted; format = new SpeechAudioFormatInfo(EncodingFormat.ALaw, 8000, 8, 1, 1, 2, null); //format = new SpeechAudioFormatInfo(11025, AudioBitsPerSample.Sixteen, AudioChannel.Mono); // var si = speech.GetType().GetMethod("SetOutputStream", BindingFlags.Instance | BindingFlags.NonPublic); stream = new MemoryStream(); //si.Invoke(speech, new object[] { stream, format, true, true }); //speech.SetOutputToWaveStream(stream); speech.SetOutputToAudioStream(stream, format); speech.SelectVoice(config.getVoice (r.Language, r.Voice)); int rate = (r.Speed * 2 - 10); Console.WriteLine(rate); try { speech.Rate = rate; } catch (ArgumentOutOfRangeException ex) { speech.Rate = 0; } speech.SpeakAsync(r.Text); //} synthesis.WaitOne(); speech.SpeakCompleted -= speech_SpeakCompleted; speech.SetOutputToNull(); speech.Dispose(); //IsSpeaking = false; IsSounding = true; stream.Position = 0; //WaveFormat.CreateCustomFormat(WaveFormatEncoding.WmaVoice9, 11025, 1, 16000, 2, 16) using(RawSourceWaveStream reader = new RawSourceWaveStream(stream, WaveFormat.CreateALawFormat(8000, 1))) { WaveStream ws = WaveFormatConversionStream.CreatePcmStream(reader); //var waveProvider = new MultiplexingWaveProvider(new IWaveProvider[] { ws }, 4); //waveProvider.ConnectInputToOutput(0, 3); sound = new WaveOutEvent(); // set output device *before* init Console.WriteLine("Output Device: " + OutputDeviceId); sound.DeviceNumber = OutputDeviceId; sound.Init(ws); //sound.Init(waveProvider); sound.PlaybackStopped += output_PlaybackStopped; // Console.WriteLine("playing here " + ws.Length); sound.Play(); } playback.WaitOne(); //IsSounding = false; speechtimer.Enabled = true; } } }; queuetimer.Enabled = true; queuetimer.Start(); speechtimer.Enabled = true; speechtimer.Start(); InitHTTPServer(); }
// Estimate the fundamental frequency using the correlation function. // Look for the first change in sign -- from negative to positive -- in the differentiated correlationFunction to approximate the fundamental frequency. public int estimateF0(double[] corr, SpeechAudioFormatInfo speechAudioFormatInfo) { int fundamentalPeriodSamples = 0; int jitter = 0; bool wasNegative = false; for (int i = 0; i < corr.Length - 1; ++i) { if (wasNegative) { if (corr[i + 1] - corr[i] >= 0) { if (jitter > 3) { i -= jitter; fundamentalPeriodSamples = i; break; } jitter++; } } else if (corr[i + 1] - corr[i] <= 0) { if (jitter > 3) { wasNegative = true; i -= jitter; jitter = 0; continue; } jitter++; } } int estimatedF0 = 0; if (fundamentalPeriodSamples > 0) estimatedF0 = (int)(1.0 * speechAudioFormatInfo.SamplesPerSecond / fundamentalPeriodSamples); // For debugging. System.Diagnostics.Debug.WriteLine("Estimated Fundamental Frequency: " + estimatedF0); return estimatedF0; }
public void TextToSpeech(string text) { this.Log.Debug("Creating wav file of: " + text); SpeechAudioFormatInfo synthFormat = new SpeechAudioFormatInfo(44100, AudioBitsPerSample.Sixteen, AudioChannel.Stereo); SpeechSynthesizer speechEngine = new SpeechSynthesizer(); this.Log.Debug("setting output: " + ttsSave); speechEngine.SetOutputToWaveFile(ttsSave, synthFormat); this.Log.Debug("speaking"); speechEngine.Speak(text); speechEngine.Dispose(); }
private void StartKinect( KinectSensor k )
{
    kinect = k;
    // Enable the color and depth streams
    kinect.ColorStream.Enable( rgbFormat );
    kinect.DepthStream.Enable( depthFormat );
    // Initialize the buffers for the RGB camera
    pixelBuffer = new byte[kinect.ColorStream.FramePixelDataLength];
    bmpBuffer = new RenderTargetBitmap( kinect.ColorStream.FrameWidth, kinect.ColorStream.FrameHeight, 96, 96, PixelFormats.Default );
    rgbImage.Source = bmpBuffer;
    // Initialize the buffers for the depth camera
    depthBuffer = new short[kinect.DepthStream.FramePixelDataLength];
    depthColorPoint = new ColorImagePoint[kinect.DepthStream.FramePixelDataLength];
    depthMaskBuffer = new byte[kinect.ColorStream.FramePixelDataLength];
    // Enable the skeleton stream
    kinect.SkeletonStream.Enable();
    // Initialize the buffers for the skeleton stream
    skeletonBuffer = new Skeleton[kinect.SkeletonStream.FrameSkeletonArrayLength];
    playerGesture = new PlayerGesture[kinect.SkeletonStream.FrameSkeletonArrayLength];
    for ( int i = 0; i < playerGesture.Length; i++ )
    {
        playerGesture[i] = new PlayerGesture();
    }
    // Register the event handler that receives the RGB, depth and skeleton events
    kinect.AllFramesReady += new EventHandler<AllFramesReadyEventArgs>( kinect_AllFramesReady );
    // Start capturing streams from the Kinect sensor
    // (the KinectSensorChooser takes care of this)
    //kinect.Start();
    // Speech recognition setup
    kinect.AudioSource.SoundSourceAngleChanged += AudioSource_SoundSourceAngleChanged;
    var stream = kinect.AudioSource.Start();
    speechEngine = InitSpeechEngine();
    speechEngine.SpeechRecognized += speechEngine_SpeechRecognized;
    var format = new SpeechAudioFormatInfo( EncodingFormat.Pcm, 16000, 16, 1, 32000, 2, null );
    speechEngine.SetInputToAudioStream( stream, format );
    speechEngine.RecognizeAsync( RecognizeMode.Multiple );
}