15821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Copyright (c) 2012 The Chromium Authors. All rights reserved. 25821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Use of this source code is governed by a BSD-style license that can be 35821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// found in the LICENSE file. 45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/audio_encoder.h" 65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h" 85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h" 95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h" 105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/stl_util.h" 11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_number_conversions.h" 125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/audio_buffer.h" 132a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/flac/include/FLAC/stream_encoder.h" 142a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/speex/include/speex/speex.h" 155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content { 175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace { 185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//-------------------------------- FLACEncoder --------------------------------- 205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const char* const kContentTypeFLAC = "audio/x-flac; rate="; 225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kFLACCompressionLevel = 0; // 0 for speed 235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FLACEncoder : public AudioEncoder { 255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLACEncoder(int sampling_rate, int bits_per_sample); 275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~FLACEncoder(); 285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void Encode(const AudioChunk& raw_audio) OVERRIDE; 295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void Flush() OVERRIDE; 305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) static FLAC__StreamEncoderWriteStatus WriteCallback( 335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const FLAC__StreamEncoder* encoder, 345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const FLAC__byte buffer[], 355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t bytes, 365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unsigned samples, 375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unsigned current_frame, 385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void* client_data); 395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__StreamEncoder* encoder_; 415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bool is_encoder_initialized_; 425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(FLACEncoder); 445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback( 475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const FLAC__StreamEncoder* encoder, 485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const FLAC__byte buffer[], 495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) size_t bytes, 505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unsigned samples, 515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) unsigned current_frame, 525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void* client_data) { 535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLACEncoder* me = static_cast<FLACEncoder*>(client_data); 545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(me->encoder_ == encoder); 555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) me->encoded_audio_buffer_.Enqueue(buffer, bytes); 565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return FLAC__STREAM_ENCODER_WRITE_STATUS_OK; 575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample) 605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : AudioEncoder(std::string(kContentTypeFLAC) + 615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::IntToString(sampling_rate), 625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bits_per_sample), 635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoder_(FLAC__stream_encoder_new()), 645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) is_encoder_initialized_(false) { 655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_set_channels(encoder_, 1); 665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample); 675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate); 685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel); 695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Initializing the encoder will cause sync bytes to be written to 715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // its output stream, so we wait until the first call to this method 725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // before doing so. 735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FLACEncoder::~FLACEncoder() { 765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_delete(encoder_); 775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void FLACEncoder::Encode(const AudioChunk& raw_audio) { 805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_EQ(raw_audio.bytes_per_sample(), 2); 815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (!is_encoder_initialized_) { 825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const FLAC__StreamEncoderInitStatus encoder_status = 835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL, 845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) NULL, this); 855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK_EQ(encoder_status, FLAC__STREAM_ENCODER_INIT_STATUS_OK); 865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) is_encoder_initialized_ = true; 875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // FLAC encoder wants samples as int32s. 905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const int num_samples = raw_audio.NumSamples(); 91c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles) scoped_ptr<FLAC__int32[]> flac_samples(new FLAC__int32[num_samples]); 925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__int32* flac_samples_ptr = flac_samples.get(); 935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < num_samples; ++i) 945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) flac_samples_ptr[i] = static_cast<FLAC__int32>(raw_audio.GetSample16(i)); 955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples); 975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void FLACEncoder::Flush() { 1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) FLAC__stream_encoder_finish(encoder_); 1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1035821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)//-------------------------------- SpeexEncoder -------------------------------- 1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1055821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const char* const kContentTypeSpeex = "audio/x-speex-with-header-byte; rate="; 1065821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kSpeexEncodingQuality = 8; 1075821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)const int kMaxSpeexFrameLength = 110; // (44kbps rate sampled at 32kHz). 1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Since the frame length gets written out as a byte in the encoded packet, 1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make sure it is within the byte range. 1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength); 1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class SpeexEncoder : public AudioEncoder { 1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public: 1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) explicit SpeexEncoder(int sampling_rate, int bits_per_sample); 1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual ~SpeexEncoder(); 1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void Encode(const AudioChunk& raw_audio) OVERRIDE; 1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) virtual void Flush() OVERRIDE {} 1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private: 1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) void* encoder_state_; 1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) SpeexBits bits_; 1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int samples_per_frame_; 1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) char encoded_frame_data_[kMaxSpeexFrameLength + 1]; // +1 for the frame size. 1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DISALLOW_COPY_AND_ASSIGN(SpeexEncoder); 1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}; 1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpeexEncoder::SpeexEncoder(int sampling_rate, int bits_per_sample) 1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : AudioEncoder(std::string(kContentTypeSpeex) + 1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) base::IntToString(sampling_rate), 1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bits_per_sample) { 1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // speex_bits_init() does not initialize all of the |bits_| struct. 1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memset(&bits_, 0, sizeof(bits_)); 1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_bits_init(&bits_); 1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoder_state_ = speex_encoder_init(&speex_wb_mode); 1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(encoder_state_); 1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_); 1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) DCHECK(samples_per_frame_ > 0); 1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int quality = kSpeexEncodingQuality; 1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality); 1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int vbr = 1; 1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr); 1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_)); 1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpeexEncoder::~SpeexEncoder() { 1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_bits_destroy(&bits_); 1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_encoder_destroy(encoder_state_); 1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpeexEncoder::Encode(const AudioChunk& raw_audio) { 1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) spx_int16_t* src_buffer = 1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) const_cast<spx_int16_t*>(raw_audio.SamplesData16()); 1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int num_samples = raw_audio.NumSamples(); 1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Drop incomplete frames, typically those which come in when recording stops. 1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) num_samples -= (num_samples % samples_per_frame_); 1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) for (int i = 0; i < num_samples; i += samples_per_frame_) { 1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_bits_reset(&bits_); 1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) speex_encode_int(encoder_state_, src_buffer + i, &bits_); 1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // Encode the frame and place the size of the frame as the first byte. This 1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) // is the packet format for MIME type x-speex-with-header-byte. 1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1, 1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) kMaxSpeexFrameLength); 1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded_frame_data_[0] = static_cast<char>(frame_length); 1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) encoded_audio_buffer_.Enqueue( 1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) reinterpret_cast<uint8*>(&encoded_frame_data_[0]), frame_length + 1); 1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) } 1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace 1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AudioEncoder* AudioEncoder::Create(Codec codec, 1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int sampling_rate, 1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) int bits_per_sample) { 1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) if (codec == CODEC_FLAC) 1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new FLACEncoder(sampling_rate, bits_per_sample); 1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return new SpeexEncoder(sampling_rate, bits_per_sample); 1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AudioEncoder::AudioEncoder(const std::string& mime_type, int bits_per_sample) 1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) : encoded_audio_buffer_(1), /* Byte granularity of encoded samples. */ 1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) mime_type_(mime_type), 1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) bits_per_sample_(bits_per_sample) { 1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AudioEncoder::~AudioEncoder() { 1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)scoped_refptr<AudioChunk> AudioEncoder::GetEncodedDataAndClear() { 1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) return encoded_audio_buffer_.DequeueAll(); 1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} 1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) 1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)} // namespace content 195