main
1package speech
2
3import (
4 "encoding/json"
5 "fmt"
6 "runtime"
7 "strings"
8 "sync"
9
10 "github.com/xlgmokha/mcp/pkg/mcp"
11)
12
13// TTSBackend represents a text-to-speech backend
14type TTSBackend interface {
15 Speak(text string, voice string, rate *int, volume *float64, output string) (string, error)
16 ListVoices(language string) ([]Voice, error)
17 SpeakFile(filepath string, voice string, rate *int, volume *float64, maxLines *int) (string, error)
18 StopSpeech() (string, error)
19 IsAvailable() bool
20 GetName() string
21}
22
// Voice describes one voice reported by a TTSBackend.
type Voice struct {
	// Name is the voice identifier shown in the short voice listing.
	Name string
	// Language is printed in parentheses after Name in the short listing.
	Language string
	// Details is the text printed for this voice when a detailed listing is
	// requested.
	Details string
}
29
30// SpeechOperations represents the Speech MCP server operations
31type SpeechOperations struct {
32 mu sync.RWMutex
33 backend TTSBackend
34}
35
36// NewSpeechOperations creates a new SpeechOperations helper
37func NewSpeechOperations() *SpeechOperations {
38 // Select appropriate TTS backend based on OS
39 var backend TTSBackend
40 switch runtime.GOOS {
41 case "darwin":
42 backend = &MacOSBackend{}
43 case "linux":
44 backend = &LinuxBackend{}
45 default:
46 // For unsupported OS, use a no-op backend
47 backend = &UnsupportedBackend{os: runtime.GOOS}
48 }
49
50 return &SpeechOperations{
51 backend: backend,
52 }
53}
54
55// New creates a new Speech MCP server
56func New() (*mcp.Server, error) {
57 speech := NewSpeechOperations()
58
59 builder := mcp.NewServerBuilder("speech-server", "1.0.0")
60
61
62
63 // Add say tool
64 builder.AddTool(mcp.NewTool("say", "Convert text to speech using system TTS. Supports voice selection, speech rate, volume, and audio file output", map[string]interface{}{
65 "type": "object",
66 "properties": map[string]interface{}{
67 "text": map[string]interface{}{
68 "type": "string",
69 "description": "The text to speak",
70 },
71 "voice": map[string]interface{}{
72 "type": "string",
73 "description": "Voice to use (platform-specific). Use list_voices to see available options",
74 },
75 "rate": map[string]interface{}{
76 "type": "integer",
77 "description": "Speech rate in words per minute (80-500). macOS: 80-500, Linux: 80-450",
78 "minimum": 80,
79 "maximum": 500,
80 },
81 "volume": map[string]interface{}{
82 "type": "number",
83 "description": "Volume level (0.0-1.0). macOS only - Linux ignores this parameter",
84 "minimum": 0.0,
85 "maximum": 1.0,
86 },
87 "output": map[string]interface{}{
88 "type": "string",
89 "description": "Output audio file path. macOS: .aiff, .wav, .m4a. Linux: .wav only",
90 },
91 },
92 "required": []string{"text"},
93 }, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
94 return speech.handleSay(req)
95 }))
96
97 // Add list_voices tool
98 builder.AddTool(mcp.NewTool("list_voices", "List available TTS voices on the system, optionally filtered by language", map[string]interface{}{
99 "type": "object",
100 "properties": map[string]interface{}{
101 "language": map[string]interface{}{
102 "type": "string",
103 "description": "Filter voices by language code (e.g., 'en', 'fr', 'de')",
104 },
105 },
106 }, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
107 return speech.handleListVoices(req)
108 }))
109
110 // Add speak_file tool
111 builder.AddTool(mcp.NewTool("speak_file", "Read and speak the contents of a text file with optional line limiting", map[string]interface{}{
112 "type": "object",
113 "properties": map[string]interface{}{
114 "filepath": map[string]interface{}{
115 "type": "string",
116 "description": "Path to the text file to read and speak",
117 },
118 "voice": map[string]interface{}{
119 "type": "string",
120 "description": "Voice to use (platform-specific)",
121 },
122 "rate": map[string]interface{}{
123 "type": "integer",
124 "description": "Speech rate in words per minute (80-500)",
125 "minimum": 80,
126 "maximum": 500,
127 },
128 "volume": map[string]interface{}{
129 "type": "number",
130 "description": "Volume level (0.0-1.0). macOS only",
131 "minimum": 0.0,
132 "maximum": 1.0,
133 },
134 "max_lines": map[string]interface{}{
135 "type": "integer",
136 "description": "Maximum number of lines to read from the file",
137 "minimum": 1,
138 },
139 },
140 "required": []string{"filepath"},
141 }, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
142 return speech.handleSpeakFile(req)
143 }))
144
145 // Add stop_speech tool
146 builder.AddTool(mcp.NewTool("stop_speech", "Stop any currently playing speech synthesis", map[string]interface{}{
147 "type": "object",
148 }, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
149 return speech.handleStopSpeech(req)
150 }))
151
152 // Add speech_settings tool
153 builder.AddTool(mcp.NewTool("speech_settings", "Get information about the speech system including platform, backend, and usage help", map[string]interface{}{
154 "type": "object",
155 }, func(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
156 return speech.handleSpeechSettings(req)
157 }))
158
159 return builder.Build(), nil
160}
161
162// handleSay speaks the provided text using the system TTS
163func (s *SpeechOperations) handleSay(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
164 s.mu.RLock()
165 defer s.mu.RUnlock()
166
167 var args struct {
168 Text string `json:"text"`
169 Voice string `json:"voice,omitempty"`
170 Rate *int `json:"rate,omitempty"` // Words per minute (80-500)
171 Volume *float64 `json:"volume,omitempty"` // 0.0 to 1.0
172 Output string `json:"output,omitempty"` // File to save audio to
173 }
174
175 argsBytes, _ := json.Marshal(req.Arguments)
176 if err := json.Unmarshal(argsBytes, &args); err != nil {
177 return mcp.CallToolResult{}, fmt.Errorf("invalid arguments: %w", err)
178 }
179
180 if args.Text == "" {
181 return mcp.CallToolResult{}, fmt.Errorf("text is required")
182 }
183
184 // Check if TTS is available on this system
185 if !s.backend.IsAvailable() {
186 return mcp.CallToolResult{}, fmt.Errorf("speech synthesis is not available on this system (backend: %s)", s.backend.GetName())
187 }
188
189 // Use backend to speak
190 result, err := s.backend.Speak(args.Text, args.Voice, args.Rate, args.Volume, args.Output)
191
192 if err != nil {
193 result += fmt.Sprintf("\nError: %v", err)
194 }
195
196 return mcp.CallToolResult{
197 Content: []mcp.Content{
198 mcp.TextContent{
199 Type: "text",
200 Text: result,
201 },
202 },
203 }, nil
204}
205
206// handleListVoices lists all available system voices
207func (s *SpeechOperations) handleListVoices(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
208 s.mu.RLock()
209 defer s.mu.RUnlock()
210
211 var args struct {
212 Language string `json:"language,omitempty"` // Filter by language code (e.g., "en", "es")
213 Detailed bool `json:"detailed,omitempty"` // Include detailed voice information
214 }
215
216 argsBytes, _ := json.Marshal(req.Arguments)
217 if err := json.Unmarshal(argsBytes, &args); err != nil {
218 return mcp.CallToolResult{}, fmt.Errorf("invalid arguments: %w", err)
219 }
220
221 // Check if TTS is available on this system
222 if !s.backend.IsAvailable() {
223 return mcp.CallToolResult{}, fmt.Errorf("voice listing is not available on this system (backend: %s)", s.backend.GetName())
224 }
225
226 voices, err := s.backend.ListVoices(args.Language)
227 if err != nil {
228 return mcp.CallToolResult{}, fmt.Errorf("failed to list voices: %v", err)
229 }
230
231 var result strings.Builder
232 result.WriteString(fmt.Sprintf("Available voices (%s):\n\n", s.backend.GetName()))
233
234 for _, voice := range voices {
235 if args.Detailed {
236 result.WriteString(voice.Details)
237 result.WriteString("\n")
238 } else {
239 result.WriteString(fmt.Sprintf("• %s (%s)\n", voice.Name, voice.Language))
240 }
241 }
242
243 if len(voices) == 0 {
244 result.WriteString("No voices found")
245 if args.Language != "" {
246 result.WriteString(fmt.Sprintf(" for language '%s'", args.Language))
247 }
248 result.WriteString("\n")
249 }
250
251 return mcp.CallToolResult{
252 Content: []mcp.Content{
253 mcp.TextContent{
254 Type: "text",
255 Text: result.String(),
256 },
257 },
258 }, nil
259}
260
261// handleSpeakFile speaks the contents of a text file
262func (s *SpeechOperations) handleSpeakFile(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
263 s.mu.RLock()
264 defer s.mu.RUnlock()
265
266 var args struct {
267 FilePath string `json:"file_path"`
268 Voice string `json:"voice,omitempty"`
269 Rate *int `json:"rate,omitempty"`
270 Volume *float64 `json:"volume,omitempty"`
271 MaxLines *int `json:"max_lines,omitempty"` // Limit lines to speak
272 }
273
274 argsBytes, _ := json.Marshal(req.Arguments)
275 if err := json.Unmarshal(argsBytes, &args); err != nil {
276 return mcp.CallToolResult{}, fmt.Errorf("invalid arguments: %w", err)
277 }
278
279 if args.FilePath == "" {
280 return mcp.CallToolResult{}, fmt.Errorf("file_path is required")
281 }
282
283 // Check if TTS is available on this system
284 if !s.backend.IsAvailable() {
285 return mcp.CallToolResult{}, fmt.Errorf("speech synthesis is not available on this system (backend: %s)", s.backend.GetName())
286 }
287
288 // Use backend to speak file
289 result, err := s.backend.SpeakFile(args.FilePath, args.Voice, args.Rate, args.Volume, args.MaxLines)
290
291 if err != nil {
292 result += fmt.Sprintf("\nError: %v", err)
293 }
294
295 return mcp.CallToolResult{
296 Content: []mcp.Content{
297 mcp.TextContent{
298 Type: "text",
299 Text: result,
300 },
301 },
302 }, nil
303}
304
305// handleStopSpeech stops any currently playing speech
306func (s *SpeechOperations) handleStopSpeech(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
307 s.mu.RLock()
308 defer s.mu.RUnlock()
309
310 // Check if TTS is available on this system
311 if !s.backend.IsAvailable() {
312 return mcp.CallToolResult{}, fmt.Errorf("speech control is not available on this system (backend: %s)", s.backend.GetName())
313 }
314
315 // Use backend to stop speech
316 result, err := s.backend.StopSpeech()
317
318 if err != nil {
319 result += fmt.Sprintf("\nError: %v", err)
320 }
321
322 return mcp.CallToolResult{
323 Content: []mcp.Content{
324 mcp.TextContent{
325 Type: "text",
326 Text: result,
327 },
328 },
329 }, nil
330}
331
332// handleSpeechSettings provides information about speech synthesis settings
333func (s *SpeechOperations) handleSpeechSettings(req mcp.CallToolRequest) (mcp.CallToolResult, error) {
334 s.mu.RLock()
335 defer s.mu.RUnlock()
336
337 backendName := s.backend.GetName()
338 isAvailable := s.backend.IsAvailable()
339
340 var result string
341
342 if !isAvailable {
343 result = fmt.Sprintf(`Speech Synthesis Settings and Usage:
344
345BACKEND: %s (NOT AVAILABLE)
346
347To enable speech synthesis on this system, please install:
348• Linux: espeak-ng or espeak
349 - Ubuntu/Debian: sudo apt install espeak-ng
350 - Fedora/RHEL: sudo dnf install espeak-ng
351 - Arch: sudo pacman -S espeak-ng
352• macOS: Built-in 'say' command (already available)
353
354Once installed, restart the MCP speech server to detect the TTS backend.`, backendName)
355 } else {
356 var outputFormats string
357 var voiceExamples string
358
359 switch runtime.GOOS {
360 case "darwin":
361 outputFormats = "• Save to file: .aiff, .wav, .m4a"
362 voiceExamples = "• Popular voices: Alex, Samantha, Victoria, Fred, Fiona"
363 case "linux":
364 outputFormats = "• Save to file: .wav"
365 voiceExamples = "• Popular voices: en+f1, en+m1, en+f2, en+m2"
366 default:
367 outputFormats = "• Output formats depend on system"
368 voiceExamples = "• Use 'list_voices' tool to see available voices"
369 }
370
371 result = fmt.Sprintf(`Speech Synthesis Settings and Usage:
372
373BACKEND: %s ✓
374
375VOICES:
376• Use 'list_voices' tool to see all available voices
377%s
378• Specify with: {"voice": "voice_name"}
379
380RATE (Speed):
381• Range: 80-500 words per minute
382• Default: ~200 wpm
383• Specify with: {"rate": 150}
384
385VOLUME:
386• Range: 0.0 (silent) to 1.0 (maximum)
387• Default: system volume
388• Specify with: {"volume": 0.8}
389
390OUTPUT FORMATS:
391%s
392• Specify with: {"output": "/path/to/file.wav"}
393
394EXAMPLES:
3951. Basic speech:
396 {"text": "Hello, this is a test"}
397
3982. Custom voice and speed:
399 {"text": "Hello world", "voice": "en+f1", "rate": 120}
400
4013. Save to file:
402 {"text": "Recording test", "output": "~/speech.wav"}
403
4044. Speak file contents:
405 {"file_path": "~/document.txt", "max_lines": 10}
406
407CONTROLS:
408• Use 'stop_speech' to interrupt any playing speech
409• Multiple speech commands will queue automatically`, backendName, voiceExamples, outputFormats)
410 }
411
412 return mcp.CallToolResult{
413 Content: []mcp.Content{
414 mcp.TextContent{
415 Type: "text",
416 Text: result,
417 },
418 },
419 }, nil
420}