From 9138ccb01397d2decd7f7743b98d0c977d2a5407 Mon Sep 17 00:00:00 2001 From: Antti Ukkonen Date: Tue, 7 Mar 2023 16:35:42 +0200 Subject: [PATCH 1/4] first draft of realtime decoding from stdin --- cmd/transcribe_on_device.go | 52 ++++++++++++++++++++++++++++++++++--- 1 file changed, 48 insertions(+), 4 deletions(-) diff --git a/cmd/transcribe_on_device.go b/cmd/transcribe_on_device.go index cc16f08..a74aee1 100644 --- a/cmd/transcribe_on_device.go +++ b/cmd/transcribe_on_device.go @@ -5,7 +5,7 @@ package cmd /* #cgo CFLAGS: -I${SRCDIR}/../decoder/include - #cgo darwin LDFLAGS: -L${SRCDIR}/../decoder/lib -Wl,-rpath,decoder/lib -lspeechly -lz -framework Foundation -lc++ -framework Security + #cgo darwin LDFLAGS: -L${SRCDIR}/../decoder/lib -Wl,-rpath,decoder/lib -lspeechlyDecoder -lz -framework Foundation -lc++ -framework Security #cgo linux LDFLAGS: -L${SRCDIR}/../decoder/lib -Wl,-rpath,$ORIGIN/../decoder/lib -Wl,--start-group -lstdc++ -lpthread -ldl -lm -lspeechly -lz #cgo tflite LDFLAGS: -ltensorflowlite_c #cgo coreml LDFLAGS: -framework coreml @@ -16,6 +16,7 @@ import "C" import ( "fmt" "os" + "io" "path" "strings" "unsafe" @@ -24,15 +25,21 @@ import ( ) func transcribeOnDevice(model string, corpusPath string) ([]AudioCorpusItem, error) { - ac, err := readAudioCorpus(corpusPath) + df, err := NewDecoderFactory(model) if err != nil { return nil, err } - df, err := NewDecoderFactory(model) + if corpusPath == "STDIN" { + d, _ := df.NewStream("") + decodeStdin(d) + } + + ac, err := readAudioCorpus(corpusPath) if err != nil { return nil, err } + bar := getBar("Transcribing", "utt", len(ac)) var results []AudioCorpusItem for _, aci := range ac { @@ -64,6 +71,43 @@ func transcribeOnDevice(model string, corpusPath string) ([]AudioCorpusItem, err return results, nil } +func decodeStdin(d *cDecoder) (error) { + cErr := C.DecoderError{} + C.Decoder_EnableVAD(d.decoder, 1, &cErr); + + go func () { + cErr := C.DecoderError{} + buffer := make([]byte, 4096) + sampleBuffer := make([]float32, 2048) + for { + if _, err := io.ReadFull(os.Stdin, buffer); err != nil { + fmt.Println("error:", err) + break; + } + bufferPos := 0 + for i := 0; i < 4096; i += 2 { + s := int16((uint16(buffer[i]) | (uint16(buffer[i + 1]) << 8))) + // fmt.Println(s) + sampleBuffer[bufferPos] = float32(s) / 32768.0 + bufferPos++ + } + C.Decoder_WriteSamples(d.decoder, (*C.float)(unsafe.Pointer(&sampleBuffer[0])), C.size_t(2048), C.int(0), &cErr) + } + }() + + for { + res := C.Decoder_WaitResults(d.decoder, &cErr) + if cErr.error_code != C.uint(0) { + return fmt.Errorf("failed reading transcript from decoder, error code %d", cErr.error_code) + } + word := C.GoString(res.word) + fmt.Println(word) + C.CResultWord_Destroy(res) + } + + return nil +} + func decodeAudioCorpusItem(audioFilePath string, aci AudioCorpusItem, d *cDecoder) (string, error) { cErr := C.DecoderError{} @@ -149,7 +193,7 @@ func (d *decoderFactory) NewStream(deviceID string) (*cDecoder, error) { return nil, fmt.Errorf("failed creating decoder instance, error code %d", cErr.error_code) } defer C.free(unsafe.Pointer(cDeviceID)) - C.Decoder_SetParamI(decoder, C.SPEECHLY_DECODER_BLOCK_MULTIPLIER_I, 20, &cErr); + C.Decoder_SetParamI(decoder, C.SPEECHLY_DECODER_BLOCK_MULTIPLIER_I, 6, &cErr); return &cDecoder{ decoder: decoder, }, nil From b3c74ed075a8528721d601fd2a4dcc781af8a88c Mon Sep 17 00:00:00 2001 From: Antti Ukkonen Date: Tue, 7 Mar 2023 18:11:29 +0200 Subject: [PATCH 2/4] add block-size flag --- cmd/transcribe.go | 7 ++++++- cmd/transcribe_on_device.go | 12 ++++++------ cmd/transcribe_on_device_not_available.go | 2 +- 3 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cmd/transcribe.go b/cmd/transcribe.go index 16ab2d3..0f71968 100644 --- a/cmd/transcribe.go +++ b/cmd/transcribe.go @@ -27,7 +27,11 @@ speechly transcribe files.jsonl --model /path/to/model/bundle`, inputPath := args[0] if model != "" { - results, err := transcribeOnDevice(model, inputPath) + bs, err := cmd.Flags().GetInt("block-size") + if err != nil { + bs = 20 + } + results, err := transcribeOnDevice(model, inputPath, bs) printResults(results, inputPath, err == nil) if err != nil { log.Fatalf("Transcribing failed: %v", err) @@ -79,6 +83,7 @@ func printResults(results []AudioCorpusItem, inputPath string, reportErrors bool func init() { transcribeCmd.Flags().StringP("app", "a", "", "Application ID to use for cloud transcription") transcribeCmd.Flags().StringP("model", "m", "", "Model bundle file. This feature is available on Enterprise plans (https://speechly.com/pricing)") + transcribeCmd.Flags().Int("block-size", 20, "Block size to be used with the on-device decoder. (Enterprise plans only.)") transcribeCmd.Flags().Bool("streaming", false, "Use the Streaming API instead of the Batch API.") RootCmd.AddCommand(transcribeCmd) } diff --git a/cmd/transcribe_on_device.go b/cmd/transcribe_on_device.go index a74aee1..8dae0b9 100644 --- a/cmd/transcribe_on_device.go +++ b/cmd/transcribe_on_device.go @@ -24,15 +24,15 @@ import ( "github.com/go-audio/audio" ) -func transcribeOnDevice(model string, corpusPath string) ([]AudioCorpusItem, error) { +func transcribeOnDevice(model string, corpusPath string, blockSize int) ([]AudioCorpusItem, error) { df, err := NewDecoderFactory(model) if err != nil { return nil, err } if corpusPath == "STDIN" { - d, _ := df.NewStream("") - decodeStdin(d) + d, _ := df.NewStream("", blockSize) + return nil, decodeStdin(d) } ac, err := readAudioCorpus(corpusPath) @@ -43,7 +43,7 @@ func transcribeOnDevice(model string, corpusPath string) ([]AudioCorpusItem, err bar := getBar("Transcribing", "utt", len(ac)) var results []AudioCorpusItem for _, aci := range ac { - d, err := df.NewStream("") + d, err := df.NewStream("", blockSize) if err != nil { barClearOnError(bar) return nil, err @@ -185,7 +185,7 @@ type cDecoder struct { index int } -func (d *decoderFactory) NewStream(deviceID string) (*cDecoder, error) { +func (d *decoderFactory) NewStream(deviceID string, blockSize int) (*cDecoder, error) { cDeviceID := C.CString(deviceID) cErr := C.DecoderError{} decoder := C.DecoderFactory_GetDecoder(d.factory, cDeviceID, &cErr) @@ -193,7 +193,7 @@ func (d *decoderFactory) NewStream(deviceID string) (*cDecoder, error) { return nil, fmt.Errorf("failed creating decoder instance, error code %d", cErr.error_code) } defer C.free(unsafe.Pointer(cDeviceID)) - C.Decoder_SetParamI(decoder, C.SPEECHLY_DECODER_BLOCK_MULTIPLIER_I, 6, &cErr); + C.Decoder_SetParamI(decoder, C.SPEECHLY_DECODER_BLOCK_MULTIPLIER_I, C.int(blockSize), &cErr) return &cDecoder{ decoder: decoder, }, nil diff --git a/cmd/transcribe_on_device_not_available.go b/cmd/transcribe_on_device_not_available.go index 1d91811..4f62091 100644 --- a/cmd/transcribe_on_device_not_available.go +++ b/cmd/transcribe_on_device_not_available.go @@ -7,6 +7,6 @@ import ( "fmt" ) -func transcribeOnDevice(bundlePath string, corpusPath string) ([]AudioCorpusItem, error) { +func transcribeOnDevice(bundlePath string, corpusPath string, blockSize int) ([]AudioCorpusItem, error) { return nil, fmt.Errorf("this version of the Speechly CLI tool does not support on-device transcription") } From 1c6eea3fc5d175e95b9f3d7d5f69a3e5e7ff80a2 Mon Sep 17 00:00:00 2001 From: Antti Ukkonen Date: Wed, 8 Mar 2023 15:16:34 +0200 Subject: [PATCH 3/4] clean up --- cmd/transcribe_on_device.go | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/cmd/transcribe_on_device.go b/cmd/transcribe_on_device.go index 8dae0b9..b562f68 100644 --- a/cmd/transcribe_on_device.go +++ b/cmd/transcribe_on_device.go @@ -24,6 +24,11 @@ import ( "github.com/go-audio/audio" ) +const ( + sampleBufferSize = 2048 + inputBufferSize = 2 * sampleBufferSize +) + func transcribeOnDevice(model string, corpusPath string, blockSize int) ([]AudioCorpusItem, error) { df, err := NewDecoderFactory(model) if err != nil { @@ -31,7 +36,10 @@ func transcribeOnDevice(model string, corpusPath string, blockSize int) ([]Audio } if corpusPath == "STDIN" { - d, _ := df.NewStream("", blockSize) + d, err := df.NewStream("", blockSize) + if err != nil { + return nil, err + } return nil, decodeStdin(d) } @@ -77,21 +85,20 @@ func decodeStdin(d *cDecoder) (error) { go func () { cErr := C.DecoderError{} - buffer := make([]byte, 4096) - sampleBuffer := make([]float32, 2048) + buffer := make([]byte, inputBufferSize) + sampleBuffer := make([]float32, sampleBufferSize) for { if _, err := io.ReadFull(os.Stdin, buffer); err != nil { fmt.Println("error:", err) break; } bufferPos := 0 - for i := 0; i < 4096; i += 2 { + for i := 0; i < inputBufferSize; i += 2 { s := int16((uint16(buffer[i]) | (uint16(buffer[i + 1]) << 8))) - // fmt.Println(s) sampleBuffer[bufferPos] = float32(s) / 32768.0 bufferPos++ } - C.Decoder_WriteSamples(d.decoder, (*C.float)(unsafe.Pointer(&sampleBuffer[0])), C.size_t(2048), C.int(0), &cErr) + C.Decoder_WriteSamples(d.decoder, (*C.float)(unsafe.Pointer(&sampleBuffer[0])), C.size_t(sampleBufferSize), C.int(0), &cErr) } }() @@ -101,7 +108,7 @@ func decodeStdin(d *cDecoder) (error) { return fmt.Errorf("failed reading transcript from decoder, error code %d", cErr.error_code) } word := C.GoString(res.word) - fmt.Println(word) + fmt.Printf("%s ", strings.ToLower(word)) C.CResultWord_Destroy(res) } From 42ebecdcf6d81eb5b4e62517bcd04a02d0c506d8 Mon Sep 17 00:00:00 2001 From: Antti Ukkonen Date: Wed, 8 Mar 2023 15:40:15 +0200 Subject: [PATCH 4/4] add message when model loaded --- cmd/transcribe_on_device.go | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmd/transcribe_on_device.go b/cmd/transcribe_on_device.go index b562f68..6e216e7 100644 --- a/cmd/transcribe_on_device.go +++ b/cmd/transcribe_on_device.go @@ -83,6 +83,8 @@ func decodeStdin(d *cDecoder) (error) { cErr := C.DecoderError{} C.Decoder_EnableVAD(d.decoder, 1, &cErr); + fmt.Println("Speechly Decoder ready!") + go func () { cErr := C.DecoderError{} buffer := make([]byte, inputBufferSize)