mcp/pkg/speech/server.go at main

main
  1package speech
  2
  3import (
  4	"encoding/json"
  5	"fmt"
  6	"runtime"
  7	"strings"
  8	"sync"
  9
 10	"github.com/xlgmokha/mcp/pkg/mcp"
 11)
 12
 13// TTSBackend represents a text-to-speech backend
 14type TTSBackend interface {
 15	Speak(text string, voice string, rate *int, volume *float64, output string) (string, error)
 16	ListVoices(language string) ([]Voice, error)
 17	SpeakFile(filepath string, voice string, rate *int, volume *float64, maxLines *int) (string, error)
 18	StopSpeech() (string, error)
 19	IsAvailable() bool
 20	GetName() string
 21}
 22
 23// Voice represents a TTS voice
 24type Voice struct {
 25	Name     string
 26	Language string
 27	Details  string
 28}
 29
 30// SpeechOperations represents the Speech MCP server operations
 31type SpeechOperations struct {
 32	mu      sync.RWMutex
 33	backend TTSBackend
 34}
 35
 36// NewSpeechOperations creates a new SpeechOperations helper
 37func NewSpeechOperations() *SpeechOperations {
 38	// Select appropriate TTS backend based on OS
 39	var backend TTSBackend
 40	switch runtime.GOOS {
 41	case "darwin":
 42		backend = &MacOSBackend{}
 43	case "linux":
 44		backend = &LinuxBackend{}
 45	default:
 46		// For unsupported OS, use a no-op backend
 47		backend = &UnsupportedBackend{os: runtime.GOOS}
 48	}
 49	
 50	return &SpeechOperations{
 51		backend: backend,
 52	}
 53}
 54
 55// New creates a new Speech MCP server
 56func New() (*mcp.Server, error) {
 57	speech := NewSpeechOperations()
 58	
 59	builder := mcp.NewServerBuilder("speech-server", "1.0.0")
 60
 61
 62
 63	// Add say tool
 64	builder.AddTool(mcp.NewTool("say", "Convert text to speech using system TTS. Supports voice selection, speech rate, volume, and audio file output", map[string]interface{}{
 65		"type": "object",
 66		"properties": map[string]interface{}{
 67			"text": map[string]interface{}{
 68				"type":        "string",
 69				"description": "The text to speak",
 70			},
 71			"voice": map[string]interface{}{
 72				"type":        "string",
 73				"description": "Voice to use (platform-specific). Use list_voices to see available options",
 74			},
 75			"rate": map[string]interface{}{
 76				"type":        "integer",
 77				"description": "Speech rate in words per minute (80-500). macOS: 80-500, Linux: 80-450",
 78				"minimum":     80,
 79				"maximum":     500,
 80			},
 81			"volume": map[string]interface{}{
 82				"type":        "number",
 83				"description": "Volume level (0.0-1.0). macOS only - Linux ignores this parameter",
 84				"minimum":     0.0,
 85				"maximum":     1.0,
 86			},
 87			"output": map[string]interface{}{
 88				"type":        "string",
 89				"description": "Output audio file path. macOS: .aiff, .wav, .m4a. Linux: .wav only",
 90			},
 91		},
 92		"required": []string{"text"},
 93	}, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
 94		return speech.handleSay(req)
 95	}))
 96
 97	// Add list_voices tool
 98	builder.AddTool(mcp.NewTool("list_voices", "List available TTS voices on the system, optionally filtered by language", map[string]interface{}{
 99		"type": "object",
100		"properties": map[string]interface{}{
101			"language": map[string]interface{}{
102				"type":        "string",
103				"description": "Filter voices by language code (e.g., 'en', 'fr', 'de')",
104			},
105		},
106	}, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
107		return speech.handleListVoices(req)
108	}))
109
110	// Add speak_file tool
111	builder.AddTool(mcp.NewTool("speak_file", "Read and speak the contents of a text file with optional line limiting", map[string]interface{}{
112		"type": "object",
113		"properties": map[string]interface{}{
114			"filepath": map[string]interface{}{
115				"type":        "string",
116				"description": "Path to the text file to read and speak",
117			},
118			"voice": map[string]interface{}{
119				"type":        "string",
120				"description": "Voice to use (platform-specific)",
121			},
122			"rate": map[string]interface{}{
123				"type":        "integer",
124				"description": "Speech rate in words per minute (80-500)",
125				"minimum":     80,
126				"maximum":     500,
127			},
128			"volume": map[string]interface{}{
129				"type":        "number",
130				"description": "Volume level (0.0-1.0). macOS only",
131				"minimum":     0.0,
132				"maximum":     1.0,
133			},
134			"max_lines": map[string]interface{}{
135				"type":        "integer",
136				"description": "Maximum number of lines to read from the file",
137				"minimum":     1,
138			},
139		},
140		"required": []string{"filepath"},
141	}, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
142		return speech.handleSpeakFile(req)
143	}))
144
145	// Add stop_speech tool
146	builder.AddTool(mcp.NewTool("stop_speech", "Stop any currently playing speech synthesis", map[string]interface{}{
147		"type": "object",
148	}, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
149		return speech.handleStopSpeech(req)
150	}))
151
152	// Add speech_settings tool
153	builder.AddTool(mcp.NewTool("speech_settings", "Get information about the speech system including platform, backend, and usage help", map[string]interface{}{
154		"type": "object",
155	}, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
156		return speech.handleSpeechSettings(req)
157	}))
158
159	return builder.Build(), nil
160}
161
162// handleSay speaks the provided text using the system TTS
163func (s *SpeechOperations) handleSay(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
164	s.mu.RLock()
165	defer s.mu.RUnlock()
166
167	var args struct {
168		Text   string  `json:"text"`
169		Voice  string  `json:"voice,omitempty"`
170		Rate   *int    `json:"rate,omitempty"`   // Words per minute (80-500)
171		Volume *float64 `json:"volume,omitempty"` // 0.0 to 1.0
172		Output string  `json:"output,omitempty"` // File to save audio to
173	}
174
175	argsBytes, _ := json.Marshal(req.Arguments)
176	if err := json.Unmarshal(argsBytes, &args); err != nil {
177		return mcp.CallToolResult{}, fmt.Errorf("invalid arguments: %w", err)
178	}
179
180	if args.Text == "" {
181		return mcp.CallToolResult{}, fmt.Errorf("text is required")
182	}
183
184	// Check if TTS is available on this system
185	if !s.backend.IsAvailable() {
186		return mcp.CallToolResult{}, fmt.Errorf("speech synthesis is not available on this system (backend: %s)", s.backend.GetName())
187	}
188
189	// Use backend to speak
190	result, err := s.backend.Speak(args.Text, args.Voice, args.Rate, args.Volume, args.Output)
191	
192	if err != nil {
193		result += fmt.Sprintf("\nError: %v", err)
194	}
195
196	return mcp.CallToolResult{
197		Content: []mcp.Content{
198			mcp.TextContent{
199				Type: "text",
200				Text: result,
201			},
202		},
203	}, nil
204}
205
206// handleListVoices lists all available system voices
207func (s *SpeechOperations) handleListVoices(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
208	s.mu.RLock()
209	defer s.mu.RUnlock()
210
211	var args struct {
212		Language string `json:"language,omitempty"` // Filter by language code (e.g., "en", "es")
213		Detailed bool   `json:"detailed,omitempty"` // Include detailed voice information
214	}
215
216	argsBytes, _ := json.Marshal(req.Arguments)
217	if err := json.Unmarshal(argsBytes, &args); err != nil {
218		return mcp.CallToolResult{}, fmt.Errorf("invalid arguments: %w", err)
219	}
220
221	// Check if TTS is available on this system
222	if !s.backend.IsAvailable() {
223		return mcp.CallToolResult{}, fmt.Errorf("voice listing is not available on this system (backend: %s)", s.backend.GetName())
224	}
225
226	voices, err := s.backend.ListVoices(args.Language)
227	if err != nil {
228		return mcp.CallToolResult{}, fmt.Errorf("failed to list voices: %v", err)
229	}
230
231	var result strings.Builder
232	result.WriteString(fmt.Sprintf("Available voices (%s):\n\n", s.backend.GetName()))
233	
234	for _, voice := range voices {
235		if args.Detailed {
236			result.WriteString(voice.Details)
237			result.WriteString("\n")
238		} else {
239			result.WriteString(fmt.Sprintf("• %s (%s)\n", voice.Name, voice.Language))
240		}
241	}
242
243	if len(voices) == 0 {
244		result.WriteString("No voices found")
245		if args.Language != "" {
246			result.WriteString(fmt.Sprintf(" for language '%s'", args.Language))
247		}
248		result.WriteString("\n")
249	}
250
251	return mcp.CallToolResult{
252		Content: []mcp.Content{
253			mcp.TextContent{
254				Type: "text",
255				Text: result.String(),
256			},
257		},
258	}, nil
259}
260
261// handleSpeakFile speaks the contents of a text file
262func (s *SpeechOperations) handleSpeakFile(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
263	s.mu.RLock()
264	defer s.mu.RUnlock()
265
266	var args struct {
267		FilePath string  `json:"file_path"`
268		Voice    string  `json:"voice,omitempty"`
269		Rate     *int    `json:"rate,omitempty"`
270		Volume   *float64 `json:"volume,omitempty"`
271		MaxLines *int    `json:"max_lines,omitempty"` // Limit lines to speak
272	}
273
274	argsBytes, _ := json.Marshal(req.Arguments)
275	if err := json.Unmarshal(argsBytes, &args); err != nil {
276		return mcp.CallToolResult{}, fmt.Errorf("invalid arguments: %w", err)
277	}
278
279	if args.FilePath == "" {
280		return mcp.CallToolResult{}, fmt.Errorf("file_path is required")
281	}
282
283	// Check if TTS is available on this system
284	if !s.backend.IsAvailable() {
285		return mcp.CallToolResult{}, fmt.Errorf("speech synthesis is not available on this system (backend: %s)", s.backend.GetName())
286	}
287
288	// Use backend to speak file
289	result, err := s.backend.SpeakFile(args.FilePath, args.Voice, args.Rate, args.Volume, args.MaxLines)
290	
291	if err != nil {
292		result += fmt.Sprintf("\nError: %v", err)
293	}
294
295	return mcp.CallToolResult{
296		Content: []mcp.Content{
297			mcp.TextContent{
298				Type: "text",
299				Text: result,
300			},
301		},
302	}, nil
303}
304
305// handleStopSpeech stops any currently playing speech
306func (s *SpeechOperations) handleStopSpeech(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
307	s.mu.RLock()
308	defer s.mu.RUnlock()
309
310	// Check if TTS is available on this system
311	if !s.backend.IsAvailable() {
312		return mcp.CallToolResult{}, fmt.Errorf("speech control is not available on this system (backend: %s)", s.backend.GetName())
313	}
314
315	// Use backend to stop speech
316	result, err := s.backend.StopSpeech()
317	
318	if err != nil {
319		result += fmt.Sprintf("\nError: %v", err)
320	}
321
322	return mcp.CallToolResult{
323		Content: []mcp.Content{
324			mcp.TextContent{
325				Type: "text",
326				Text: result,
327			},
328		},
329	}, nil
330}
331
332// handleSpeechSettings provides information about speech synthesis settings
333func (s *SpeechOperations) handleSpeechSettings(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
334	s.mu.RLock()
335	defer s.mu.RUnlock()
336
337	backendName := s.backend.GetName()
338	isAvailable := s.backend.IsAvailable()
339	
340	var result string
341	
342	if !isAvailable {
343		result = fmt.Sprintf(`Speech Synthesis Settings and Usage:
344
345BACKEND: %s (NOT AVAILABLE)
346
347To enable speech synthesis on this system, please install:
348• Linux: espeak-ng or espeak
349  - Ubuntu/Debian: sudo apt install espeak-ng
350  - Fedora/RHEL: sudo dnf install espeak-ng
351  - Arch: sudo pacman -S espeak-ng
352• macOS: Built-in 'say' command (already available)
353
354Once installed, restart the MCP speech server to detect the TTS backend.`, backendName)
355	} else {
356		var outputFormats string
357		var voiceExamples string
358		
359		switch runtime.GOOS {
360		case "darwin":
361			outputFormats = "• Save to file: .aiff, .wav, .m4a"
362			voiceExamples = "• Popular voices: Alex, Samantha, Victoria, Fred, Fiona"
363		case "linux":
364			outputFormats = "• Save to file: .wav"
365			voiceExamples = "• Popular voices: en+f1, en+m1, en+f2, en+m2"
366		default:
367			outputFormats = "• Output formats depend on system"
368			voiceExamples = "• Use 'list_voices' tool to see available voices"
369		}
370		
371		result = fmt.Sprintf(`Speech Synthesis Settings and Usage:
372
373BACKEND: %s ✓
374
375VOICES:
376• Use 'list_voices' tool to see all available voices
377%s
378• Specify with: {"voice": "voice_name"}
379
380RATE (Speed):
381• Range: 80-500 words per minute
382• Default: ~200 wpm
383• Specify with: {"rate": 150}
384
385VOLUME:
386• Range: 0.0 (silent) to 1.0 (maximum)
387• Default: system volume
388• Specify with: {"volume": 0.8}
389
390OUTPUT FORMATS:
391%s
392• Specify with: {"output": "/path/to/file.wav"}
393
394EXAMPLES:
3951. Basic speech:
396   {"text": "Hello, this is a test"}
397
3982. Custom voice and speed:
399   {"text": "Hello world", "voice": "en+f1", "rate": 120}
400
4013. Save to file:
402   {"text": "Recording test", "output": "~/speech.wav"}
403
4044. Speak file contents:
405   {"file_path": "~/document.txt", "max_lines": 10}
406
407CONTROLS:
408• Use 'stop_speech' to interrupt any playing speech
409• Multiple speech commands will queue automatically`, backendName, voiceExamples, outputFormats)
410	}
411
412	return mcp.CallToolResult{
413		Content: []mcp.Content{
414			mcp.TextContent{
415				Type: "text",
416				Text: result,
417			},
418		},
419	}, nil
420}