// Copyright (c) 2012 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
45821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
55821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/audio_encoder.h"
65821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
75821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/basictypes.h"
85821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/logging.h"
95821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/memory/scoped_ptr.h"
105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "base/stl_util.h"
11868fa2fe829687343ffae624259930155e16dbd8Torne (Richard Coles)#include "base/strings/string_number_conversions.h"
125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)#include "content/browser/speech/audio_buffer.h"
132a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/flac/include/FLAC/stream_encoder.h"
142a99a7e74a7f215066514fe81d2bfa6639d9edddTorne (Richard Coles)#include "third_party/speex/include/speex/speex.h"
155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace content {
175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)namespace {
185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
//-------------------------------- FLACEncoder ---------------------------------
205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// MIME type prefix reported for FLAC output. The FLACEncoder constructor
// appends the sampling rate as a decimal integer.
const char kContentTypeFLAC[] = "audio/x-flac; rate=";

// Compression level handed to the FLAC encoder; 0 is chosen for speed.
const int kFLACCompressionLevel = 0;
235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class FLACEncoder : public AudioEncoder {
255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLACEncoder(int sampling_rate, int bits_per_sample);
275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~FLACEncoder();
285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void Encode(const AudioChunk& raw_audio) OVERRIDE;
295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void Flush() OVERRIDE;
305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  static FLAC__StreamEncoderWriteStatus WriteCallback(
335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      const FLAC__StreamEncoder* encoder,
345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      const FLAC__byte buffer[],
355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      size_t bytes,
365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      unsigned samples,
375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      unsigned current_frame,
385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      void* client_data);
395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__StreamEncoder* encoder_;
415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  bool is_encoder_initialized_;
425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(FLACEncoder);
445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FLAC__StreamEncoderWriteStatus FLACEncoder::WriteCallback(
475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const FLAC__StreamEncoder* encoder,
485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const FLAC__byte buffer[],
495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    size_t bytes,
505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    unsigned samples,
515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    unsigned current_frame,
525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    void* client_data) {
535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLACEncoder* me = static_cast<FLACEncoder*>(client_data);
545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK(me->encoder_ == encoder);
555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  me->encoded_audio_buffer_.Enqueue(buffer, bytes);
565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return FLAC__STREAM_ENCODER_WRITE_STATUS_OK;
575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FLACEncoder::FLACEncoder(int sampling_rate, int bits_per_sample)
605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : AudioEncoder(std::string(kContentTypeFLAC) +
615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   base::IntToString(sampling_rate),
625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   bits_per_sample),
635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      encoder_(FLAC__stream_encoder_new()),
645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      is_encoder_initialized_(false) {
655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_set_channels(encoder_, 1);
665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_set_bits_per_sample(encoder_, bits_per_sample);
675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_set_sample_rate(encoder_, sampling_rate);
685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_set_compression_level(encoder_, kFLACCompressionLevel);
695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Initializing the encoder will cause sync bytes to be written to
715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // its output stream, so we wait until the first call to this method
725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // before doing so.
735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)FLACEncoder::~FLACEncoder() {
765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_delete(encoder_);
775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void FLACEncoder::Encode(const AudioChunk& raw_audio) {
805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DCHECK_EQ(raw_audio.bytes_per_sample(), 2);
815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (!is_encoder_initialized_) {
825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    const FLAC__StreamEncoderInitStatus encoder_status =
835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        FLAC__stream_encoder_init_stream(encoder_, WriteCallback, NULL, NULL,
845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                         NULL, this);
855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    DCHECK_EQ(encoder_status, FLAC__STREAM_ENCODER_INIT_STATUS_OK);
865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    is_encoder_initialized_ = true;
875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // FLAC encoder wants samples as int32s.
905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  const int num_samples = raw_audio.NumSamples();
91c2e0dbddbe15c98d52c4786dac06cb8952a8ae6dTorne (Richard Coles)  scoped_ptr<FLAC__int32[]> flac_samples(new FLAC__int32[num_samples]);
925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__int32* flac_samples_ptr = flac_samples.get();
935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < num_samples; ++i)
945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    flac_samples_ptr[i] = static_cast<FLAC__int32>(raw_audio.GetSample16(i));
955821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
965821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_process(encoder_, &flac_samples_ptr, num_samples);
975821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
985821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
995821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void FLACEncoder::Flush() {
1005821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  FLAC__stream_encoder_finish(encoder_);
1015821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1025821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
//-------------------------------- SpeexEncoder --------------------------------
1045821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
// MIME type prefix reported for Speex output. The SpeexEncoder constructor
// appends the sampling rate as a decimal integer.
const char kContentTypeSpeex[] = "audio/x-speex-with-header-byte; rate=";

// Quality setting passed to the Speex encoder (SPEEX_SET_QUALITY).
const int kSpeexEncodingQuality = 8;

// Upper bound, in bytes, on a single encoded Speex frame.
const int kMaxSpeexFrameLength = 110;  // (44kbps rate sampled at 32kHz).
1085821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1095821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// Since the frame length gets written out as a byte in the encoded packet,
1105821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)// make sure it is within the byte range.
1115821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)COMPILE_ASSERT(kMaxSpeexFrameLength <= 0xFF, invalidLength);
1125821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1135821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)class SpeexEncoder : public AudioEncoder {
1145821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) public:
1155821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  explicit SpeexEncoder(int sampling_rate, int bits_per_sample);
1165821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual ~SpeexEncoder();
1175821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void Encode(const AudioChunk& raw_audio) OVERRIDE;
1185821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  virtual void Flush() OVERRIDE {}
1195821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1205821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles) private:
1215821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  void* encoder_state_;
1225821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  SpeexBits bits_;
1235821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int samples_per_frame_;
1245821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  char encoded_frame_data_[kMaxSpeexFrameLength + 1];  // +1 for the frame size.
1255821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  DISALLOW_COPY_AND_ASSIGN(SpeexEncoder);
1265821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)};
1275821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1285821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpeexEncoder::SpeexEncoder(int sampling_rate, int bits_per_sample)
1295821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : AudioEncoder(std::string(kContentTypeSpeex) +
1305821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   base::IntToString(sampling_rate),
1315821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                   bits_per_sample) {
1325821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   // speex_bits_init() does not initialize all of the |bits_| struct.
1335821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   memset(&bits_, 0, sizeof(bits_));
1345821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   speex_bits_init(&bits_);
1355821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   encoder_state_ = speex_encoder_init(&speex_wb_mode);
1365821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   DCHECK(encoder_state_);
1375821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   speex_encoder_ctl(encoder_state_, SPEEX_GET_FRAME_SIZE, &samples_per_frame_);
1385821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   DCHECK(samples_per_frame_ > 0);
1395821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   int quality = kSpeexEncodingQuality;
1405821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   speex_encoder_ctl(encoder_state_, SPEEX_SET_QUALITY, &quality);
1415821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   int vbr = 1;
1425821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   speex_encoder_ctl(encoder_state_, SPEEX_SET_VBR, &vbr);
1435821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)   memset(encoded_frame_data_, 0, sizeof(encoded_frame_data_));
1445821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1455821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1465821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)SpeexEncoder::~SpeexEncoder() {
1475821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  speex_bits_destroy(&bits_);
1485821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  speex_encoder_destroy(encoder_state_);
1495821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1505821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1515821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)void SpeexEncoder::Encode(const AudioChunk& raw_audio) {
1525821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  spx_int16_t* src_buffer =
1535821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      const_cast<spx_int16_t*>(raw_audio.SamplesData16());
1545821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  int num_samples = raw_audio.NumSamples();
1555821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  // Drop incomplete frames, typically those which come in when recording stops.
1565821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  num_samples -= (num_samples % samples_per_frame_);
1575821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  for (int i = 0; i < num_samples; i += samples_per_frame_) {
1585821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    speex_bits_reset(&bits_);
1595821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    speex_encode_int(encoder_state_, src_buffer + i, &bits_);
1605821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1615821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // Encode the frame and place the size of the frame as the first byte. This
1625821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    // is the packet format for MIME type x-speex-with-header-byte.
1635821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    int frame_length = speex_bits_write(&bits_, encoded_frame_data_ + 1,
1645821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                        kMaxSpeexFrameLength);
1655821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    encoded_frame_data_[0] = static_cast<char>(frame_length);
1665821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    encoded_audio_buffer_.Enqueue(
1675821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)        reinterpret_cast<uint8*>(&encoded_frame_data_[0]), frame_length + 1);
1685821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  }
1695821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1705821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1715821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace
1725821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1735821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AudioEncoder* AudioEncoder::Create(Codec codec,
1745821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                   int sampling_rate,
1755821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)                                   int bits_per_sample) {
1765821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  if (codec == CODEC_FLAC)
1775821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    return new FLACEncoder(sampling_rate, bits_per_sample);
1785821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return new SpeexEncoder(sampling_rate, bits_per_sample);
1795821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1805821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1815821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AudioEncoder::AudioEncoder(const std::string& mime_type, int bits_per_sample)
1825821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)    : encoded_audio_buffer_(1), /* Byte granularity of encoded samples. */
1835821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      mime_type_(mime_type),
1845821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)      bits_per_sample_(bits_per_sample) {
1855821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1865821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1875821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)AudioEncoder::~AudioEncoder() {
1885821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1895821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1905821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)scoped_refptr<AudioChunk> AudioEncoder::GetEncodedDataAndClear() {
1915821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)  return encoded_audio_buffer_.DequeueAll();
1925821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}
1935821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)
1945821806d5e7f356e8fa4b058a389a808ea183019Torne (Richard Coles)}  // namespace content
195