mcp/pkg/speech/backends.go at main

main
  1package speech
  2
  3import (
  4	"fmt"
  5	"os"
  6	"os/exec"
  7	"path/filepath"
  8	"strconv"
  9	"strings"
 10)
 11
 12// MacOSBackend implements TTS using macOS 'say' command
 13type MacOSBackend struct{}
 14
 15func (m *MacOSBackend) Speak(text string, voice string, rate *int, volume *float64, output string) (string, error) {
 16	cmdArgs := []string{}
 17	
 18	if voice != "" {
 19		cmdArgs = append(cmdArgs, "-v", voice)
 20	}
 21	
 22	if rate != nil {
 23		if *rate < 80 || *rate > 500 {
 24			return "", fmt.Errorf("rate must be between 80-500 words per minute")
 25		}
 26		cmdArgs = append(cmdArgs, "-r", strconv.Itoa(*rate))
 27	}
 28	
 29	if volume != nil {
 30		if *volume < 0.0 || *volume > 1.0 {
 31			return "", fmt.Errorf("volume must be between 0.0 and 1.0")
 32		}
 33		// Convert to 0-100 scale for say command
 34		volumeInt := int(*volume * 100)
 35		cmdArgs = append(cmdArgs, "--volume", strconv.Itoa(volumeInt))
 36	}
 37	
 38	if output != "" {
 39		// Validate output file extension
 40		ext := strings.ToLower(filepath.Ext(output))
 41		if ext != ".aiff" && ext != ".wav" && ext != ".m4a" {
 42			return "", fmt.Errorf("output format must be .aiff, .wav, or .m4a")
 43		}
 44		cmdArgs = append(cmdArgs, "-o", output)
 45	}
 46	
 47	// Add the text to speak
 48	cmdArgs = append(cmdArgs, text)
 49
 50	cmd := exec.Command("say", cmdArgs...)
 51	output_bytes, err := cmd.CombinedOutput()
 52	
 53	var result string
 54	if output != "" {
 55		result = fmt.Sprintf("Audio saved to: %s", output)
 56	} else {
 57		result = fmt.Sprintf("Spoke: \"%s\"", text)
 58	}
 59	
 60	if len(output_bytes) > 0 {
 61		result += fmt.Sprintf("\nOutput: %s", string(output_bytes))
 62	}
 63	
 64	if err != nil {
 65		return result, err
 66	}
 67	
 68	return result, nil
 69}
 70
 71func (m *MacOSBackend) ListVoices(language string) ([]Voice, error) {
 72	cmd := exec.Command("say", "-v", "?")
 73	output, err := cmd.Output()
 74	
 75	if err != nil {
 76		return nil, fmt.Errorf("failed to list voices: %v", err)
 77	}
 78	
 79	voices := []Voice{}
 80	lines := strings.Split(string(output), "\n")
 81	
 82	for _, line := range lines {
 83		line = strings.TrimSpace(line)
 84		if line == "" {
 85			continue
 86		}
 87		
 88		// Filter by language if specified
 89		if language != "" && !strings.Contains(strings.ToLower(line), language) {
 90			continue
 91		}
 92		
 93		// Parse voice line (format: "Name  Language  # Details")
 94		parts := strings.Fields(line)
 95		if len(parts) >= 2 {
 96			voice := Voice{
 97				Name:     parts[0],
 98				Language: parts[1],
 99				Details:  line,
100			}
101			voices = append(voices, voice)
102		}
103	}
104	
105	return voices, nil
106}
107
108func (m *MacOSBackend) SpeakFile(filepath string, voice string, rate *int, volume *float64, maxLines *int) (string, error) {
109	// Read the file to get stats
110	content, err := os.ReadFile(filepath)
111	if err != nil {
112		return "", fmt.Errorf("failed to read file: %v", err)
113	}
114
115	text := string(content)
116	linesCount := len(strings.Split(text, "\n"))
117	wordsCount := len(strings.Fields(text))
118	
119	// Build say command
120	cmdArgs := []string{}
121	
122	if voice != "" {
123		cmdArgs = append(cmdArgs, "-v", voice)
124	}
125	
126	if rate != nil {
127		if *rate < 80 || *rate > 500 {
128			return "", fmt.Errorf("rate must be between 80-500 words per minute")
129		}
130		cmdArgs = append(cmdArgs, "-r", strconv.Itoa(*rate))
131	}
132	
133	if volume != nil {
134		if *volume < 0.0 || *volume > 1.0 {
135			return "", fmt.Errorf("volume must be between 0.0 and 1.0")
136		}
137		volumeInt := int(*volume * 100)
138		cmdArgs = append(cmdArgs, "--volume", strconv.Itoa(volumeInt))
139	}
140	
141	// If maxLines specified, speak text directly with limit
142	if maxLines != nil && *maxLines > 0 && *maxLines < linesCount {
143		lines := strings.Split(text, "\n")
144		lines = lines[:*maxLines]
145		limitedText := strings.Join(lines, "\n")
146		cmdArgs = append(cmdArgs, limitedText)
147		
148		cmd := exec.Command("say", cmdArgs...)
149		_, err := cmd.CombinedOutput()
150		
151		result := fmt.Sprintf("Speaking file: %s\nLines: %d (limited to %d), Words: ~%d", 
152			filepath, linesCount, *maxLines, len(strings.Fields(limitedText)))
153		
154		if err != nil {
155			return result, err
156		}
157		return result, nil
158	}
159	
160	// Otherwise use -f flag to speak entire file
161	cmdArgs = append(cmdArgs, "-f", filepath)
162
163	cmd := exec.Command("say", cmdArgs...)
164	_, err = cmd.CombinedOutput()
165	
166	result := fmt.Sprintf("Speaking file: %s\nLines: %d, Words: %d", 
167		filepath, linesCount, wordsCount)
168	
169	if err != nil {
170		return result, err
171	}
172	
173	return result, nil
174}
175
176func (m *MacOSBackend) StopSpeech() (string, error) {
177	cmd := exec.Command("pkill", "say")
178	err := cmd.Run()
179	
180	if err != nil {
181		// pkill returns error if no processes found, which is fine
182		return "Stopped all speech synthesis (no speech processes were running)", nil
183	}
184	
185	return "Stopped all speech synthesis", nil
186}
187
188func (m *MacOSBackend) IsAvailable() bool {
189	_, err := exec.LookPath("say")
190	return err == nil
191}
192
193func (m *MacOSBackend) GetName() string {
194	return "macOS say"
195}
196
// LinuxBackend implements text-to-speech using espeak-ng or espeak.
type LinuxBackend struct {
	// command holds the executable name found by getCommand; it stays empty
	// until a lookup has succeeded.
	command string
}

// getCommand returns the name of the speech executable to run, or "" when
// neither candidate is on the PATH. espeak-ng is tried before espeak. A
// successful lookup is stored in l.command and reused by later calls; a
// failed lookup is not stored, so the next call searches again.
func (l *LinuxBackend) getCommand() string {
	if l.command == "" {
		for _, candidate := range [...]string{"espeak-ng", "espeak"} {
			if _, err := exec.LookPath(candidate); err != nil {
				continue
			}
			l.command = candidate
			break
		}
	}
	return l.command
}
221
222func (l *LinuxBackend) Speak(text string, voice string, rate *int, volume *float64, output string) (string, error) {
223	cmd := l.getCommand()
224	if cmd == "" {
225		return "", fmt.Errorf("no TTS command available (install espeak-ng or espeak)")
226	}
227	
228	cmdArgs := []string{}
229	
230	// Add voice selection
231	if voice != "" {
232		cmdArgs = append(cmdArgs, "-v", voice)
233	}
234	
235	// Add speech rate (words per minute)
236	if rate != nil {
237		// espeak uses words per minute directly
238		cmdArgs = append(cmdArgs, "-s", strconv.Itoa(*rate))
239	}
240	
241	// Add volume (amplitude)
242	if volume != nil {
243		// espeak uses amplitude 0-200, with 100 as default
244		amplitude := int(*volume * 200)
245		cmdArgs = append(cmdArgs, "-a", strconv.Itoa(amplitude))
246	}
247	
248	// Add output file if specified
249	if output != "" {
250		// espeak supports wav output
251		ext := strings.ToLower(filepath.Ext(output))
252		if ext != ".wav" {
253			return "", fmt.Errorf("output format must be .wav for Linux TTS")
254		}
255		cmdArgs = append(cmdArgs, "-w", output)
256	}
257	
258	// Add the text
259	cmdArgs = append(cmdArgs, text)
260	
261	command := exec.Command(cmd, cmdArgs...)
262	output_bytes, err := command.CombinedOutput()
263	
264	var result string
265	if output != "" {
266		result = fmt.Sprintf("Audio saved to: %s", output)
267	} else {
268		result = fmt.Sprintf("Spoke: \"%s\"", text)
269	}
270	
271	if len(output_bytes) > 0 && !strings.Contains(string(output_bytes), "ALSA lib") {
272		// Filter out common ALSA warnings
273		result += fmt.Sprintf("\nOutput: %s", string(output_bytes))
274	}
275	
276	if err != nil {
277		return result, err
278	}
279	
280	return result, nil
281}
282
283func (l *LinuxBackend) ListVoices(language string) ([]Voice, error) {
284	cmd := l.getCommand()
285	if cmd == "" {
286		return nil, fmt.Errorf("no TTS command available (install espeak-ng or espeak)")
287	}
288	
289	command := exec.Command(cmd, "--voices")
290	output, err := command.Output()
291	
292	if err != nil {
293		return nil, fmt.Errorf("failed to list voices: %v", err)
294	}
295	
296	voices := []Voice{}
297	lines := strings.Split(string(output), "\n")
298	
299	// Skip header line
300	if len(lines) > 0 {
301		lines = lines[1:]
302	}
303	
304	for _, line := range lines {
305		line = strings.TrimSpace(line)
306		if line == "" {
307			continue
308		}
309		
310		// Parse espeak voice format
311		// Format: "Pty Language Age/Gender VoiceName        File        Other Languages"
312		fields := strings.Fields(line)
313		if len(fields) >= 4 {
314			lang := fields[1]
315			name := fields[3]
316			
317			// Filter by language if specified
318			if language != "" && !strings.Contains(strings.ToLower(lang), strings.ToLower(language)) {
319				continue
320			}
321			
322			voice := Voice{
323				Name:     name,
324				Language: lang,
325				Details:  line,
326			}
327			voices = append(voices, voice)
328		}
329	}
330	
331	return voices, nil
332}
333
334func (l *LinuxBackend) SpeakFile(filepath string, voice string, rate *int, volume *float64, maxLines *int) (string, error) {
335	// Read the file to get stats and handle maxLines
336	content, err := os.ReadFile(filepath)
337	if err != nil {
338		return "", fmt.Errorf("failed to read file: %v", err)
339	}
340
341	text := string(content)
342	linesCount := len(strings.Split(text, "\n"))
343	wordsCount := len(strings.Fields(text))
344	
345	// Limit lines if specified
346	actualText := text
347	if maxLines != nil && *maxLines > 0 && *maxLines < linesCount {
348		lines := strings.Split(text, "\n")
349		lines = lines[:*maxLines]
350		actualText = strings.Join(lines, "\n")
351	}
352	
353	// Use Speak method with the text
354	result, err := l.Speak(actualText, voice, rate, volume, "")
355	
356	fileInfo := fmt.Sprintf("Speaking file: %s\nLines: %d", filepath, linesCount)
357	if maxLines != nil && *maxLines < linesCount {
358		fileInfo += fmt.Sprintf(" (limited to %d)", *maxLines)
359	}
360	fileInfo += fmt.Sprintf(", Words: %d", wordsCount)
361	
362	if err != nil {
363		return fileInfo + "\n" + result, err
364	}
365	
366	return fileInfo + "\n" + result, nil
367}
368
369func (l *LinuxBackend) StopSpeech() (string, error) {
370	cmd := l.getCommand()
371	if cmd == "" {
372		return "No TTS command available", nil
373	}
374	
375	// Kill espeak/espeak-ng processes
376	exec.Command("pkill", cmd).Run()
377	
378	// Also try to kill common audio players that might be used
379	exec.Command("pkill", "aplay").Run()
380	exec.Command("pkill", "paplay").Run()
381	
382	return fmt.Sprintf("Stopped all %s processes", cmd), nil
383}
384
385func (l *LinuxBackend) IsAvailable() bool {
386	return l.getCommand() != ""
387}
388
389func (l *LinuxBackend) GetName() string {
390	cmd := l.getCommand()
391	if cmd != "" {
392		return cmd
393	}
394	return "Linux TTS (not available)"
395}
396
397// UnsupportedBackend for unsupported operating systems
398type UnsupportedBackend struct {
399	os string
400}
401
402func (u *UnsupportedBackend) Speak(text string, voice string, rate *int, volume *float64, output string) (string, error) {
403	return "", fmt.Errorf("speech synthesis is not supported on %s", u.os)
404}
405
406func (u *UnsupportedBackend) ListVoices(language string) ([]Voice, error) {
407	return nil, fmt.Errorf("voice listing is not supported on %s", u.os)
408}
409
410func (u *UnsupportedBackend) SpeakFile(filepath string, voice string, rate *int, volume *float64, maxLines *int) (string, error) {
411	return "", fmt.Errorf("file speaking is not supported on %s", u.os)
412}
413
414func (u *UnsupportedBackend) StopSpeech() (string, error) {
415	return "", fmt.Errorf("speech control is not supported on %s", u.os)
416}
417
418func (u *UnsupportedBackend) IsAvailable() bool {
419	return false
420}
421
422func (u *UnsupportedBackend) GetName() string {
423	return fmt.Sprintf("Unsupported (%s)", u.os)
424}