diff --git a/cmd/transcribe.go b/cmd/transcribe.go
index 16ab2d3..0f71968 100644
--- a/cmd/transcribe.go
+++ b/cmd/transcribe.go
@@ -27,7 +27,11 @@ speechly transcribe files.jsonl --model /path/to/model/bundle`,
 		inputPath := args[0]
 
 		if model != "" {
-			results, err := transcribeOnDevice(model, inputPath)
+			bs, err := cmd.Flags().GetInt("block-size")
+			if err != nil {
+				bs = 20
+			}
+			results, err := transcribeOnDevice(model, inputPath, bs)
 			printResults(results, inputPath, err == nil)
 			if err != nil {
 				log.Fatalf("Transcribing failed: %v", err)
@@ -79,6 +83,7 @@ func printResults(results []AudioCorpusItem, inputPath string, reportErrors bool
 func init() {
 	transcribeCmd.Flags().StringP("app", "a", "", "Application ID to use for cloud transcription")
 	transcribeCmd.Flags().StringP("model", "m", "", "Model bundle file. This feature is available on Enterprise plans (https://speechly.com/pricing)")
+	transcribeCmd.Flags().Int("block-size", 20, "Block size to be used with the on-device decoder. (Enterprise plans only.)")
 	transcribeCmd.Flags().Bool("streaming", false, "Use the Streaming API instead of the Batch API.")
 	RootCmd.AddCommand(transcribeCmd)
 }
diff --git a/cmd/transcribe_on_device.go b/cmd/transcribe_on_device.go
index cc16f08..6e216e7 100644
--- a/cmd/transcribe_on_device.go
+++ b/cmd/transcribe_on_device.go
@@ -5,7 +5,7 @@ package cmd
 
 /*
  #cgo CFLAGS: -I${SRCDIR}/../decoder/include
- #cgo darwin LDFLAGS: -L${SRCDIR}/../decoder/lib -Wl,-rpath,decoder/lib -lspeechly -lz -framework Foundation -lc++ -framework Security
+ #cgo darwin LDFLAGS: -L${SRCDIR}/../decoder/lib -Wl,-rpath,decoder/lib -lspeechlyDecoder -lz -framework Foundation -lc++ -framework Security
  #cgo linux LDFLAGS: -L${SRCDIR}/../decoder/lib -Wl,-rpath,$ORIGIN/../decoder/lib -Wl,--start-group -lstdc++ -lpthread -ldl -lm -lspeechly -lz
  #cgo tflite LDFLAGS: -ltensorflowlite_c
  #cgo coreml LDFLAGS: -framework coreml
@@ -16,6 +16,7 @@ import "C"
 import (
 	"fmt"
+	"io"
 	"os"
 	"path"
 	"strings"
 	"unsafe"
@@ -23,20 +24,40 @@ import (
 	"github.com/go-audio/audio"
 )
 
-func transcribeOnDevice(model string, corpusPath string) ([]AudioCorpusItem, error) {
-	ac, err := readAudioCorpus(corpusPath)
+const (
+	// sampleBufferSize is the number of samples handed to the decoder per write.
+	sampleBufferSize = 2048
+	// inputBufferSize is the matching byte count: two bytes per 16-bit sample.
+	inputBufferSize = 2 * sampleBufferSize
+)
+
+// transcribeOnDevice transcribes with the on-device decoder loaded from model.
+// When corpusPath is "STDIN" it decodes audio streamed on standard input and
+// returns no corpus items; otherwise it transcribes the audio corpus read from
+// corpusPath. blockSize is passed to every decoder stream it creates.
+func transcribeOnDevice(model string, corpusPath string, blockSize int) ([]AudioCorpusItem, error) {
+	df, err := NewDecoderFactory(model)
 	if err != nil {
 		return nil, err
 	}
 
-	df, err := NewDecoderFactory(model)
+	if corpusPath == "STDIN" {
+		d, err := df.NewStream("", blockSize)
+		if err != nil {
+			return nil, err
+		}
+		return nil, decodeStdin(d)
+	}
+
+	ac, err := readAudioCorpus(corpusPath)
 	if err != nil {
 		return nil, err
 	}
+
 	bar := getBar("Transcribing", "utt", len(ac))
 	var results []AudioCorpusItem
 	for _, aci := range ac {
-		d, err := df.NewStream("")
+		d, err := df.NewStream("", blockSize)
 		if err != nil {
 			barClearOnError(bar)
 			return nil, err
@@ -64,6 +85,65 @@ func transcribeOnDevice(model string, corpusPath string) ([]AudioCorpusItem, err
 	return results, nil
 }
 
+// decodeStdin streams audio from standard input into d and prints each decoded
+// word, lowercased and followed by a space, to standard output. Input is read
+// as 16-bit little-endian samples, two bytes each, and scaled by 1/32768 before
+// being written to the decoder. It returns only when the decoder reports an
+// error.
+//
+// NOTE(review): once stdin ends the reader goroutine stops, but the result loop
+// keeps blocking in Decoder_WaitResults; confirm whether the decoder expects an
+// end-of-input signal so the command can exit cleanly.
+func decodeStdin(d *cDecoder) error {
+	cErr := C.DecoderError{}
+	C.Decoder_EnableVAD(d.decoder, 1, &cErr)
+	if cErr.error_code != C.uint(0) {
+		return fmt.Errorf("failed enabling VAD, error code %d", cErr.error_code)
+	}
+
+	fmt.Println("Speechly Decoder ready!")
+
+	// Feed the decoder from a separate goroutine while the loop below waits for
+	// results. The goroutine has its own DecoderError so the two sides never
+	// write to the same struct.
+	go func() {
+		cErr := C.DecoderError{}
+		buffer := make([]byte, inputBufferSize)
+		sampleBuffer := make([]float32, sampleBufferSize)
+		for {
+			n, err := io.ReadFull(os.Stdin, buffer)
+			// A short final read still carries audio: convert the complete
+			// samples in buffer[:n] instead of dropping them.
+			samples := n / 2
+			for i := 0; i < samples; i++ {
+				s := int16(uint16(buffer[2*i]) | uint16(buffer[2*i+1])<<8)
+				sampleBuffer[i] = float32(s) / 32768.0
+			}
+			if samples > 0 {
+				C.Decoder_WriteSamples(d.decoder, (*C.float)(unsafe.Pointer(&sampleBuffer[0])), C.size_t(samples), C.int(0), &cErr)
+				if cErr.error_code != C.uint(0) {
+					fmt.Println("error: failed writing samples to decoder, error code", cErr.error_code)
+					return
+				}
+			}
+			if err != nil {
+				fmt.Println("error:", err)
+				return
+			}
+		}
+	}()
+
+	for {
+		res := C.Decoder_WaitResults(d.decoder, &cErr)
+		if cErr.error_code != C.uint(0) {
+			return fmt.Errorf("failed reading transcript from decoder, error code %d", cErr.error_code)
+		}
+		word := C.GoString(res.word)
+		fmt.Printf("%s ", strings.ToLower(word))
+		C.CResultWord_Destroy(res)
+	}
+}
+
 func decodeAudioCorpusItem(audioFilePath string, aci AudioCorpusItem, d *cDecoder) (string, error) {
 	cErr := C.DecoderError{}
 
@@ -141,7 +221,12 @@ type cDecoder struct {
 	index int
 }
 
-func (d *decoderFactory) NewStream(deviceID string) (*cDecoder, error) {
+// NewStream creates a decoder for deviceID and sets its block multiplier to
+// blockSize. It returns an error if the decoder cannot be created or rejects
+// the parameter.
+func (d *decoderFactory) NewStream(deviceID string, blockSize int) (*cDecoder, error) {
 	cDeviceID := C.CString(deviceID)
+	// Release the C copy of deviceID on every return path.
+	defer C.free(unsafe.Pointer(cDeviceID))
 	cErr := C.DecoderError{}
 	decoder := C.DecoderFactory_GetDecoder(d.factory, cDeviceID, &cErr)
@@ -149,7 +234,9 @@ func (d *decoderFactory) NewStream(deviceID string) (*cDecoder, error) {
 		return nil, fmt.Errorf("failed creating decoder instance, error code %d", cErr.error_code)
 	}
-	defer C.free(unsafe.Pointer(cDeviceID))
-	C.Decoder_SetParamI(decoder, C.SPEECHLY_DECODER_BLOCK_MULTIPLIER_I, 20, &cErr);
+	C.Decoder_SetParamI(decoder, C.SPEECHLY_DECODER_BLOCK_MULTIPLIER_I, C.int(blockSize), &cErr)
+	if cErr.error_code != C.uint(0) {
+		return nil, fmt.Errorf("failed setting decoder block size, error code %d", cErr.error_code)
+	}
 	return &cDecoder{
 		decoder: decoder,
 	}, nil
diff --git a/cmd/transcribe_on_device_not_available.go b/cmd/transcribe_on_device_not_available.go
index 1d91811..4f62091 100644
--- a/cmd/transcribe_on_device_not_available.go
+++ b/cmd/transcribe_on_device_not_available.go
@@ -7,6 +7,6 @@ import (
 	"fmt"
 )
 
-func transcribeOnDevice(bundlePath string, corpusPath string) ([]AudioCorpusItem, error) {
+func transcribeOnDevice(bundlePath string, corpusPath string, blockSize int) ([]AudioCorpusItem, error) {
 	return nil, fmt.Errorf("this version of the Speechly CLI tool does not support on-device transcription")
 }