17c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
27c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
37c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete WardenLicensed under the Apache License, Version 2.0 (the "License");
47c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardenyou may not use this file except in compliance with the License.
57c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete WardenYou may obtain a copy of the License at
67c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
77c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden    http://www.apache.org/licenses/LICENSE-2.0
87c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
97c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete WardenUnless required by applicable law or agreed to in writing, software
107c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardendistributed under the License is distributed on an "AS IS" BASIS,
117c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete WardenWITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
127c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete WardenSee the License for the specific language governing permissions and
137c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardenlimitations under the License.
147c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden==============================================================================*/
157c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
167c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/examples/wav_to_spectrogram/wav_to_spectrogram.h"
177c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
187c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include <vector>
197c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
207c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/cc/ops/audio_ops.h"
217c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/cc/ops/const_op.h"
227c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/cc/ops/image_ops.h"
237c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/cc/ops/standard_ops.h"
247c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/framework/graph.pb.h"
257c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/framework/tensor.h"
267c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/graph/default_device.h"
277c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/graph/graph_def_builder.h"
287c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/lib/core/errors.h"
297c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/lib/core/stringpiece.h"
307c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/lib/core/threadpool.h"
317c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/lib/io/path.h"
327c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/lib/strings/stringprintf.h"
337c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/platform/logging.h"
347c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/platform/types.h"
357c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/public/session.h"
367c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden#include "tensorflow/core/util/command_line_flags.h"
377c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
387c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardenusing tensorflow::DT_FLOAT;
397c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardenusing tensorflow::DT_UINT8;
407c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardenusing tensorflow::Output;
417c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardenusing tensorflow::TensorShape;
427c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
437c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden// Runs a TensorFlow graph to convert an audio file into a visualization.
447c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Wardentensorflow::Status WavToSpectrogram(const tensorflow::string& input_wav,
457c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden                                    tensorflow::int32 window_size,
467c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden                                    tensorflow::int32 stride, float brightness,
477c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden                                    const tensorflow::string& output_image) {
487c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  auto root = tensorflow::Scope::NewRootScope();
497c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  using namespace tensorflow::ops;  // NOLINT(build/namespaces)
507c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // The following block creates a TensorFlow graph that:
517c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  //  - Reads and decodes the audio file into a tensor of float samples.
527c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  //  - Creates a float spectrogram from those samples.
537c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  //  - Scales, clamps, and converts that spectrogram to 0 to 255 uint8's.
547c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  //  - Reshapes the tensor so that it's [height, width, 1] for imaging.
557c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  //  - Encodes it as a PNG stream and saves it out to a file.
56b463cbaa874c390296fd23e9dac31cadc58211a3Loo Rong Jie  Output file_reader =
57b463cbaa874c390296fd23e9dac31cadc58211a3Loo Rong Jie      tensorflow::ops::ReadFile(root.WithOpName("input_wav"), input_wav);
587c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  DecodeWav wav_decoder =
597c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden      DecodeWav(root.WithOpName("wav_decoder"), file_reader);
607c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output spectrogram = AudioSpectrogram(root.WithOpName("spectrogram"),
617c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden                                        wav_decoder.audio, window_size, stride);
627c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output brightness_placeholder =
637c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden      Placeholder(root.WithOpName("brightness_placeholder"), DT_FLOAT,
647c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden                  Placeholder::Attrs().Shape(TensorShape({})));
657c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output mul = Mul(root.WithOpName("mul"), spectrogram, brightness_placeholder);
667c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output min_const = Const(root.WithOpName("min_const"), 255.0f);
677c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output min = Minimum(root.WithOpName("min"), mul, min_const);
687c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output cast = Cast(root.WithOpName("cast"), min, DT_UINT8);
697c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output expand_dims_const = Const(root.WithOpName("expand_dims_const"), -1);
707c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output expand_dims =
717c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden      ExpandDims(root.WithOpName("expand_dims"), cast, expand_dims_const);
727c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output squeeze = Squeeze(root.WithOpName("squeeze"), expand_dims,
732f70cef14b65008f5acc30820b83759090879754Anna R                           Squeeze::Attrs().Axis({0}));
747c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  Output png_encoder = EncodePng(root.WithOpName("png_encoder"), squeeze);
75b463cbaa874c390296fd23e9dac31cadc58211a3Loo Rong Jie  tensorflow::ops::WriteFile file_writer = tensorflow::ops::WriteFile(
76b463cbaa874c390296fd23e9dac31cadc58211a3Loo Rong Jie      root.WithOpName("output_image"), output_image, png_encoder);
777c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  tensorflow::GraphDef graph;
787c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  TF_RETURN_IF_ERROR(root.ToGraphDef(&graph));
797c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
807c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // Build a session object from this graph definition. The power of TensorFlow
817c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // is that you can reuse complex computations like this, so usually we'd run a
827c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // lot of different inputs through it. In this example, we're just doing a
837c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // one-off run, so we'll create it and then use it immediately.
847c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  std::unique_ptr<tensorflow::Session> session(
857c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden      tensorflow::NewSession(tensorflow::SessionOptions()));
867c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  TF_RETURN_IF_ERROR(session->Create(graph));
877c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
887c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // We're passing in the brightness as an input, so create a tensor to hold the
897c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // value.
907c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  tensorflow::Tensor brightness_tensor(DT_FLOAT, TensorShape({}));
917c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  brightness_tensor.scalar<float>()() = brightness;
927c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden
937c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  // Run the session to analyze the audio and write out the file.
947c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  TF_RETURN_IF_ERROR(
957c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden      session->Run({{"brightness_placeholder", brightness_tensor}}, {},
967c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden                   {"output_image"}, nullptr));
977c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden  return tensorflow::Status::OK();
987c9d2a458ee6cb925a0b3d23793d0e356a6eac12Pete Warden}
99