[{"data":1,"prerenderedAt":1995},["ShallowReactive",2],{"i-mdi:open-in-new":3,"i-mdi:github":8,"i-mdi:menu":10,"i-local:logo":12,"blog-incident-response":15},{"left":4,"top":4,"width":5,"height":5,"rotate":4,"vFlip":6,"hFlip":6,"body":7},0,24,false,"\u003Cpath fill=\"currentColor\" d=\"M14 3v2h3.59l-9.83 9.83l1.41 1.41L19 6.41V10h2V3m-2 16H5V5h7V3H5a2 2 0 0 0-2 2v14a2 2 0 0 0 2 2h14a2 2 0 0 0 2-2v-7h-2z\"\u002F>",{"left":4,"top":4,"width":5,"height":5,"rotate":4,"vFlip":6,"hFlip":6,"body":9},"\u003Cpath fill=\"currentColor\" d=\"M12 2A10 10 0 0 0 2 12c0 4.42 2.87 8.17 6.84 9.5c.5.08.66-.23.66-.5v-1.69c-2.77.6-3.36-1.34-3.36-1.34c-.46-1.16-1.11-1.47-1.11-1.47c-.91-.62.07-.6.07-.6c1 .07 1.53 1.03 1.53 1.03c.87 1.52 2.34 1.07 2.91.83c.09-.65.35-1.09.63-1.34c-2.22-.25-4.55-1.11-4.55-4.92c0-1.11.38-2 1.03-2.71c-.1-.25-.45-1.29.1-2.64c0 0 .84-.27 2.75 1.02c.79-.22 1.65-.33 2.5-.33s1.71.11 2.5.33c1.91-1.29 2.75-1.02 2.75-1.02c.55 1.35.2 2.39.1 2.64c.65.71 1.03 1.6 1.03 2.71c0 3.82-2.34 4.66-4.57 4.91c.36.31.69.92.69 1.85V21c0 .27.16.59.67.5C19.14 20.16 22 16.42 22 12A10 10 0 0 0 12 2\"\u002F>",{"left":4,"top":4,"width":5,"height":5,"rotate":4,"vFlip":6,"hFlip":6,"body":11},"\u003Cpath fill=\"currentColor\" d=\"M3 6h18v2H3zm0 5h18v2H3zm0 5h18v2H3z\"\u002F>",{"left":4,"top":4,"width":13,"height":13,"rotate":4,"vFlip":6,"hFlip":6,"body":14},1200,"\u003Cdefs\n     id=\"defs1\" \u002F>\n  \u003Cpath\n     style=\"fill:#326ce5;fill-opacity:1;stroke:#326ce5;stroke-width:54.2178;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dasharray:none;stroke-dashoffset:65.7793;stroke-opacity:1;paint-order:stroke fill markers\"\n     id=\"path10\"\n     d=\"M 778.34299,825.61572 273.93527,960.77136 -95.316804,591.51929 39.838835,87.111573 544.24655,-48.044069 913.49863,321.20801 Z\"\n     transform=\"matrix(0.89078213,-0.23868436,0.23868436,0.89078213,126.66226,291.12302)\" \u002F>\n  \u003Cpath\n     style=\"fill:#ffa400;fill-opacity:1;stroke:#ffffff;stroke-width:171.118;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dasharray:none;stroke-dashoffset:65.7793;stroke-opacity:1;paint-order:stroke fill markers\"\n     id=\"path10-0-4\"\n     d=\"M 778.34299,825.61572 273.93527,960.77136 -95.316804,591.51929 39.838835,87.111573 544.24655,-48.044069 913.49863,321.20801 Z\"\n     transform=\"matrix(0.56447964,-0.15125188,0.15125188,0.56447964,300.05065,404.26778)\" \u002F>\n  \u003Cg\n     id=\"g24\"\n     transform=\"translate(-0.00289,-1.818185)\">\n    \u003Cpath\n       style=\"fill:#ffa400;fill-opacity:1;stroke:#ffffff;stroke-width:25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dasharray:none;stroke-dashoffset:65.7793;stroke-opacity:1;paint-order:stroke fill markers\"\n       d=\"M 236.36364,809.09091 960.89972,390.77981\"\n       id=\"path22\" \u002F>\n    \u003Cpath\n       style=\"fill:#ffa400;fill-opacity:1;stroke:#ffffff;stroke-width:25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dasharray:none;stroke-dashoffset:65.7793;stroke-opacity:1;paint-order:stroke fill markers\"\n       d=\"m 236.36364,389.09091 727.2785,419.89444\"\n       id=\"path23\" \u002F>\n    \u003Cpath\n       style=\"fill:#ffa400;fill-opacity:1;stroke:#ffffff;stroke-width:25;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dasharray:none;stroke-dashoffset:65.7793;stroke-opacity:1;paint-order:stroke fill markers\"\n       d=\"M 600.18182,1010.9091 V 192.72727\"\n       id=\"path24\" \u002F>\n  \u003C\u002Fg>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1\"\n     cx=\"261.54996\"\n     cy=\"402.17349\"\n     r=\"19.439373\" \u002F>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1-3\"\n     cx=\"257.91364\"\n     cy=\"793.18634\"\n     r=\"19.439373\" \u002F>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1-5\"\n     cx=\"599.82275\"\n     cy=\"993.0954\"\n     r=\"19.439373\" \u002F>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1-2\"\n     cx=\"939.64093\"\n     cy=\"794.36816\"\n     r=\"19.439373\" \u002F>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1-8\"\n     cx=\"599.55005\"\n     cy=\"207.73181\"\n     r=\"19.439373\" \u002F>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1-1\"\n     cx=\"939.91364\"\n     cy=\"403.18637\"\n     r=\"19.439373\" \u002F>\n  \u003Ccircle\n     style=\"fill:none;stroke:#ffffff;stroke-width:42.7667;stroke-linecap:round;stroke-linejoin:round;stroke-miterlimit:0;stroke-dashoffset:65.7793;paint-order:stroke fill markers\"\n     id=\"path1-1-7\"\n     cx=\"600\"\n     cy=\"600\"\n     r=\"19.439373\" \u002F>",{"id":16,"title":17,"author":18,"authorUrl":19,"body":20,"category":1981,"date":1982,"description":1983,"extension":1984,"image":202,"meta":1985,"navigation":861,"path":1986,"seo":1987,"series":1988,"stem":1989,"tags":1990,"__hash__":1994},"blog\u002Fblog\u002Fincident-response.md","Automated incident response with AI agents on Kubernetes","rdmnl","https:\u002F\u002Fgithub.com\u002Frdmnl",{"type":21,"value":22,"toc":1969},"minimark",[23,42,45,53,85,92,95,98,103,106,180,183,187,196,221,225,228,235,279,300,308,316,320,325,531,550,555,643,650,655,812,819,876,882,886,889,1103,1106,1122,1126,1129,1210,1246,1252,1255,1272,1347,1350,1355,1438,1445,1450,1469,1475,1478,1482,1485,1602,1656,1660,1663,1735,1742,1813,1816,1820,1904,1907,1911,1928,1942,1944,1965],[24,25,26],"blockquote",{},[27,28,29,33,34,41],"p",{},[30,31,32],"strong",{},"TL;DR"," - Three agents investigate a production alert, diagnose root cause, and notify\nyour team on Slack. Uses mock MCP servers for PagerDuty, Grafana, and Slack - all running\nlocally. Total time: under 3 minutes. Total API cost: $0.00.\nAll files are in the ",[35,36,40],"a",{"href":37,"rel":38},"https:\u002F\u002Fgithub.com\u002Fkubeswarm\u002Fkubeswarm-cookbook\u002Ftree\u002Fmain\u002Frecipes\u002F16-incident-response",[39],"nofollow","examples directory",".",[43,44],"hr",{},[27,46,47,48,52],{},"This showed up in our ",[49,50,51],"code",{},"#incidents"," channel at 02:52 UTC:",[24,54,55,72],{},[27,56,57,60,61,64,65,68,69],{},[30,58,59],{},"Service:"," payments-api\n",[30,62,63],{},"Root Cause:"," PostgreSQL primary server failure leading to\ndatabase connection issues and exhausted connection pool\n",[30,66,67],{},"Severity:"," High\n",[30,70,71],{},"Remediation:",[73,74,75,79,82],"ul",{},[76,77,78],"li",{},"Promote the standby PostgreSQL replica to primary",[76,80,81],{},"Restart the payments-api deployment",[76,83,84],{},"Clear connection pool caches",[27,86,87,88,91],{},"Nobody on the team wrote that. An agent pipeline did. It pulled logs from Grafana,\nfound 312 error entries pointing to ",[49,89,90],{},"postgres-primary:5432 - connection refused",",\ncorrelated the spike with a failover event at 02:47, and posted the summary.\nThe PagerDuty alert got acknowledged automatically.",[27,93,94],{},"I wanted to see if I could wire this up with kubeswarm - three agents, each with\ndifferent tool access, where the one reading logs can't post to Slack and the one\nposting to Slack can't read logs. Turns out you can, and it's not that much YAML.",[27,96,97],{},"Here's how I built it.",[99,100,102],"h2",{"id":101},"the-setup","The setup",[27,104,105],{},"Three agents, one pipeline. I split it by access level on purpose - I don't want\na single agent that can both read production data and take actions on it.",[107,108,109,128],"table",{},[110,111,112],"thead",{},[113,114,115,119,122,125],"tr",{},[116,117,118],"th",{},"Agent",[116,120,121],{},"Role",[116,123,124],{},"MCP Tools",[116,126,127],{},"What it does",[129,130,131,148,164],"tbody",{},[113,132,133,139,142,145],{},[134,135,136],"td",{},[30,137,138],{},"Investigator",[134,140,141],{},"Gather evidence",[134,143,144],{},"Grafana (logs, metrics), PagerDuty (read)",[134,146,147],{},"Queries logs and metrics around the alert",[113,149,150,155,158,161],{},[134,151,152],{},[30,153,154],{},"Diagnostician",[134,156,157],{},"Analyze",[134,159,160],{},"None",[134,162,163],{},"Reads the evidence, identifies root cause",[113,165,166,171,174,177],{},[134,167,168],{},[30,169,170],{},"Notifier",[134,172,173],{},"Communicate",[134,175,176],{},"Slack (write), PagerDuty (acknowledge)",[134,178,179],{},"Posts findings, acks the alert",[27,181,182],{},"The diagnostician is intentionally tool-less. It gets the investigator's report,\nfigures out what went wrong, and passes that to the notifier. The agent that\ndecides what happened should never be the same agent that has write access.",[99,184,186],{"id":185},"prerequisites","Prerequisites",[27,188,189,190,195],{},"You need a Kubernetes cluster with kubeswarm installed\n(",[35,191,194],{"href":192,"rel":193},"https:\u002F\u002Fdocs.kubeswarm.io\u002Fquick-start",[39],"quick-start guide",") and Ollama running:",[197,198,203],"pre",{"className":199,"code":200,"language":201,"meta":202,"style":202},"language-bash shiki shiki-themes github-dark","ollama pull qwen2.5:7b\n","bash","",[49,204,205],{"__ignoreMap":202},[206,207,210,214,218],"span",{"class":208,"line":209},"line",1,[206,211,213],{"class":212},"svObZ","ollama",[206,215,217],{"class":216},"sU2Wk"," pull",[206,219,220],{"class":216}," qwen2.5:7b\n",[99,222,224],{"id":223},"mock-mcp-servers","Mock MCP servers",[27,226,227],{},"Obviously I'm not going to connect this to real PagerDuty and Grafana for a blog post.\nSo I wrote three tiny mock MCP servers in Go - about 80 lines each, no dependencies\noutside stdlib. They return realistic canned data over HTTP.",[27,229,230,231,234],{},"The mock Grafana returns error logs with ",[49,232,233],{},"connection refused to postgres-primary:5432","\nand a metrics spike at 02:47 UTC. Classic postgres failover scenario.",[197,236,238],{"className":199,"code":237,"language":201,"meta":202,"style":202},"kubectl apply -f namespace.yaml\nkubectl apply -f ollama-secret.yaml\nkubectl apply -f mock-servers.yaml\n",[49,239,240,255,267],{"__ignoreMap":202},[206,241,242,245,248,252],{"class":208,"line":209},[206,243,244],{"class":212},"kubectl",[206,246,247],{"class":216}," apply",[206,249,251],{"class":250},"sDLfK"," -f",[206,253,254],{"class":216}," namespace.yaml\n",[206,256,258,260,262,264],{"class":208,"line":257},2,[206,259,244],{"class":212},[206,261,247],{"class":216},[206,263,251],{"class":250},[206,265,266],{"class":216}," ollama-secret.yaml\n",[206,268,270,272,274,276],{"class":208,"line":269},3,[206,271,244],{"class":212},[206,273,247],{"class":216},[206,275,251],{"class":250},[206,277,278],{"class":216}," mock-servers.yaml\n",[197,280,282],{"className":199,"code":281,"language":201,"meta":202,"style":202},"kubectl get pods -n incident-response\n",[49,283,284],{"__ignoreMap":202},[206,285,286,288,291,294,297],{"class":208,"line":209},[206,287,244],{"class":212},[206,289,290],{"class":216}," get",[206,292,293],{"class":216}," pods",[206,295,296],{"class":250}," -n",[206,298,299],{"class":216}," incident-response\n",[197,301,306],{"className":302,"code":304,"language":305},[303],"language-text","NAME                              READY   STATUS    AGE\nmock-grafana-7b4f8d6c5-x2k9p     1\u002F1     Running   5s\nmock-pagerduty-5c9d8e7f4-m3n7q   1\u002F1     Running   5s\nmock-slack-6a8b9c0d1-r4s6t       1\u002F1     Running   5s\n","text",[49,307,304],{"__ignoreMap":202},[27,309,310,311,315],{},"Source code for the mocks is in the\n",[35,312,40],{"href":313,"rel":314},"https:\u002F\u002Fgithub.com\u002Fkubeswarm\u002Fkubeswarm-cookbook\u002Ftree\u002Fmain\u002Frecipes\u002F16-incident-response\u002Fmock-servers",[39],"\nif you want to look - they're straightforward.",[99,317,319],{"id":318},"the-agents","The agents",[27,321,322,324],{},[30,323,138],{}," - read-only access to observability:",[197,326,330],{"className":327,"code":328,"language":329,"meta":202,"style":202},"language-yaml shiki shiki-themes github-dark","# investigator-agent.yaml\nspec:\n  model: qwen2.5:7b\n  prompt:\n    inline: |\n      You are an SRE investigator. When you receive a PagerDuty incident,\n      gather evidence from logs and metrics. Output a structured JSON report\n      with incident_id, error_pattern, timeline, and raw_evidence.\n  tools:\n    mcp:\n      - name: pagerduty\n        url: \"http:\u002F\u002Fmock-pagerduty.incident-response.svc:8080\"\n      - name: grafana\n        url: \"http:\u002F\u002Fmock-grafana.incident-response.svc:8080\"\n  guardrails:\n    limits:\n      tokensPerCall: 4000\n      timeoutSeconds: 90\n    tools:\n      allow:\n        - \"pagerduty\u002Fget_incident\"\n        - \"grafana\u002F*\"\n","yaml",[49,331,332,338,348,359,367,379,385,391,397,405,413,427,438,450,460,468,476,487,498,506,514,523],{"__ignoreMap":202},[206,333,334],{"class":208,"line":209},[206,335,337],{"class":336},"sAwPA","# investigator-agent.yaml\n",[206,339,340,344],{"class":208,"line":257},[206,341,343],{"class":342},"s4JwU","spec",[206,345,347],{"class":346},"s95oV",":\n",[206,349,350,353,356],{"class":208,"line":269},[206,351,352],{"class":342},"  model",[206,354,355],{"class":346},": ",[206,357,358],{"class":216},"qwen2.5:7b\n",[206,360,362,365],{"class":208,"line":361},4,[206,363,364],{"class":342},"  prompt",[206,366,347],{"class":346},[206,368,370,373,375],{"class":208,"line":369},5,[206,371,372],{"class":342},"    inline",[206,374,355],{"class":346},[206,376,378],{"class":377},"snl16","|\n",[206,380,382],{"class":208,"line":381},6,[206,383,384],{"class":216},"      You are an SRE investigator. When you receive a PagerDuty incident,\n",[206,386,388],{"class":208,"line":387},7,[206,389,390],{"class":216},"      gather evidence from logs and metrics. Output a structured JSON report\n",[206,392,394],{"class":208,"line":393},8,[206,395,396],{"class":216},"      with incident_id, error_pattern, timeline, and raw_evidence.\n",[206,398,400,403],{"class":208,"line":399},9,[206,401,402],{"class":342},"  tools",[206,404,347],{"class":346},[206,406,408,411],{"class":208,"line":407},10,[206,409,410],{"class":342},"    mcp",[206,412,347],{"class":346},[206,414,416,419,422,424],{"class":208,"line":415},11,[206,417,418],{"class":346},"      - ",[206,420,421],{"class":342},"name",[206,423,355],{"class":346},[206,425,426],{"class":216},"pagerduty\n",[206,428,430,433,435],{"class":208,"line":429},12,[206,431,432],{"class":342},"        url",[206,434,355],{"class":346},[206,436,437],{"class":216},"\"http:\u002F\u002Fmock-pagerduty.incident-response.svc:8080\"\n",[206,439,441,443,445,447],{"class":208,"line":440},13,[206,442,418],{"class":346},[206,444,421],{"class":342},[206,446,355],{"class":346},[206,448,449],{"class":216},"grafana\n",[206,451,453,455,457],{"class":208,"line":452},14,[206,454,432],{"class":342},[206,456,355],{"class":346},[206,458,459],{"class":216},"\"http:\u002F\u002Fmock-grafana.incident-response.svc:8080\"\n",[206,461,463,466],{"class":208,"line":462},15,[206,464,465],{"class":342},"  guardrails",[206,467,347],{"class":346},[206,469,471,474],{"class":208,"line":470},16,[206,472,473],{"class":342},"    limits",[206,475,347],{"class":346},[206,477,479,482,484],{"class":208,"line":478},17,[206,480,481],{"class":342},"      tokensPerCall",[206,483,355],{"class":346},[206,485,486],{"class":250},"4000\n",[206,488,490,493,495],{"class":208,"line":489},18,[206,491,492],{"class":342},"      timeoutSeconds",[206,494,355],{"class":346},[206,496,497],{"class":250},"90\n",[206,499,501,504],{"class":208,"line":500},19,[206,502,503],{"class":342},"    tools",[206,505,347],{"class":346},[206,507,509,512],{"class":208,"line":508},20,[206,510,511],{"class":342},"      allow",[206,513,347],{"class":346},[206,515,517,520],{"class":208,"line":516},21,[206,518,519],{"class":346},"        - ",[206,521,522],{"class":216},"\"pagerduty\u002Fget_incident\"\n",[206,524,526,528],{"class":208,"line":525},22,[206,527,519],{"class":346},[206,529,530],{"class":216},"\"grafana\u002F*\"\n",[27,532,533,534,537,538,541,542,545,546,549],{},"Note the ",[49,535,536],{},"tools.allow"," list. It can call ",[49,539,540],{},"grafana\u002Fquery_logs"," and ",[49,543,544],{},"grafana\u002Fquery_metrics","\nbut if I added a ",[49,547,548],{},"slack"," MCP server here, the allow list would block it. I like that\nthe access control is declarative and visible in the YAML.",[27,551,552,554],{},[30,553,154],{}," - no tools, just reasoning:",[197,556,558],{"className":327,"code":557,"language":329,"meta":202,"style":202},"# diagnostician-agent.yaml\nspec:\n  model: qwen2.5:7b\n  prompt:\n    inline: |\n      You are a senior SRE diagnostician. Identify the root cause from the\n      investigation report. Suggest a specific remediation command, not\n      vague advice. Output JSON with root_cause, severity, remediation,\n      and confidence.\n  guardrails:\n    limits:\n      tokensPerCall: 3000\n      timeoutSeconds: 120\n",[49,559,560,565,571,579,585,593,598,603,608,613,619,625,634],{"__ignoreMap":202},[206,561,562],{"class":208,"line":209},[206,563,564],{"class":336},"# diagnostician-agent.yaml\n",[206,566,567,569],{"class":208,"line":257},[206,568,343],{"class":342},[206,570,347],{"class":346},[206,572,573,575,577],{"class":208,"line":269},[206,574,352],{"class":342},[206,576,355],{"class":346},[206,578,358],{"class":216},[206,580,581,583],{"class":208,"line":361},[206,582,364],{"class":342},[206,584,347],{"class":346},[206,586,587,589,591],{"class":208,"line":369},[206,588,372],{"class":342},[206,590,355],{"class":346},[206,592,378],{"class":377},[206,594,595],{"class":208,"line":381},[206,596,597],{"class":216},"      You are a senior SRE diagnostician. Identify the root cause from the\n",[206,599,600],{"class":208,"line":387},[206,601,602],{"class":216},"      investigation report. Suggest a specific remediation command, not\n",[206,604,605],{"class":208,"line":393},[206,606,607],{"class":216},"      vague advice. Output JSON with root_cause, severity, remediation,\n",[206,609,610],{"class":208,"line":399},[206,611,612],{"class":216},"      and confidence.\n",[206,614,615,617],{"class":208,"line":407},[206,616,465],{"class":342},[206,618,347],{"class":346},[206,620,621,623],{"class":208,"line":415},[206,622,473],{"class":342},[206,624,347],{"class":346},[206,626,627,629,631],{"class":208,"line":429},[206,628,481],{"class":342},[206,630,355],{"class":346},[206,632,633],{"class":250},"3000\n",[206,635,636,638,640],{"class":208,"line":440},[206,637,492],{"class":342},[206,639,355],{"class":346},[206,641,642],{"class":250},"120\n",[27,644,645,646,649],{},"No ",[49,647,648],{},"tools"," section at all. I went back and forth on whether this agent should have\naccess to anything. Decided no - the separation between \"reading\" and \"deciding\"\nis the whole point.",[27,651,652,654],{},[30,653,170],{}," - write access to Slack and PagerDuty:",[197,656,658],{"className":327,"code":657,"language":329,"meta":202,"style":202},"# notifier-agent.yaml\nspec:\n  model: qwen2.5:7b\n  prompt:\n    inline: |\n      You are an incident communications agent.\n      You MUST call post_message with channel \"#incidents\" and a text\n      summary. Then call acknowledge_incident with the incident_id.\n  tools:\n    mcp:\n      - name: slack\n        url: \"http:\u002F\u002Fmock-slack.incident-response.svc:8080\"\n      - name: pagerduty\n        url: \"http:\u002F\u002Fmock-pagerduty.incident-response.svc:8080\"\n  guardrails:\n    limits:\n      tokensPerCall: 3000\n      timeoutSeconds: 90\n    tools:\n      allow:\n        - \"slack\u002Fpost_message\"\n        - \"pagerduty\u002Facknowledge_incident\"\n",[49,659,660,665,671,679,685,693,698,703,708,714,720,731,740,750,758,764,770,778,786,792,798,805],{"__ignoreMap":202},[206,661,662],{"class":208,"line":209},[206,663,664],{"class":336},"# notifier-agent.yaml\n",[206,666,667,669],{"class":208,"line":257},[206,668,343],{"class":342},[206,670,347],{"class":346},[206,672,673,675,677],{"class":208,"line":269},[206,674,352],{"class":342},[206,676,355],{"class":346},[206,678,358],{"class":216},[206,680,681,683],{"class":208,"line":361},[206,682,364],{"class":342},[206,684,347],{"class":346},[206,686,687,689,691],{"class":208,"line":369},[206,688,372],{"class":342},[206,690,355],{"class":346},[206,692,378],{"class":377},[206,694,695],{"class":208,"line":381},[206,696,697],{"class":216},"      You are an incident communications agent.\n",[206,699,700],{"class":208,"line":387},[206,701,702],{"class":216},"      You MUST call post_message with channel \"#incidents\" and a text\n",[206,704,705],{"class":208,"line":393},[206,706,707],{"class":216},"      summary. Then call acknowledge_incident with the incident_id.\n",[206,709,710,712],{"class":208,"line":399},[206,711,402],{"class":342},[206,713,347],{"class":346},[206,715,716,718],{"class":208,"line":407},[206,717,410],{"class":342},[206,719,347],{"class":346},[206,721,722,724,726,728],{"class":208,"line":415},[206,723,418],{"class":346},[206,725,421],{"class":342},[206,727,355],{"class":346},[206,729,730],{"class":216},"slack\n",[206,732,733,735,737],{"class":208,"line":429},[206,734,432],{"class":342},[206,736,355],{"class":346},[206,738,739],{"class":216},"\"http:\u002F\u002Fmock-slack.incident-response.svc:8080\"\n",[206,741,742,744,746,748],{"class":208,"line":440},[206,743,418],{"class":346},[206,745,421],{"class":342},[206,747,355],{"class":346},[206,749,426],{"class":216},[206,751,752,754,756],{"class":208,"line":452},[206,753,432],{"class":342},[206,755,355],{"class":346},[206,757,437],{"class":216},[206,759,760,762],{"class":208,"line":462},[206,761,465],{"class":342},[206,763,347],{"class":346},[206,765,766,768],{"class":208,"line":470},[206,767,473],{"class":342},[206,769,347],{"class":346},[206,771,772,774,776],{"class":208,"line":478},[206,773,481],{"class":342},[206,775,355],{"class":346},[206,777,633],{"class":250},[206,779,780,782,784],{"class":208,"line":489},[206,781,492],{"class":342},[206,783,355],{"class":346},[206,785,497],{"class":250},[206,787,788,790],{"class":208,"line":500},[206,789,503],{"class":342},[206,791,347],{"class":346},[206,793,794,796],{"class":208,"line":508},[206,795,511],{"class":342},[206,797,347],{"class":346},[206,799,800,802],{"class":208,"line":516},[206,801,519],{"class":346},[206,803,804],{"class":216},"\"slack\u002Fpost_message\"\n",[206,806,807,809],{"class":208,"line":525},[206,808,519],{"class":346},[206,810,811],{"class":216},"\"pagerduty\u002Facknowledge_incident\"\n",[27,813,814,815,818],{},"The ",[49,816,817],{},"MUST call"," in the prompt is not elegant, but with a 7B model you sometimes\nneed to be blunt. Bigger models follow the instructions without the shouting.",[197,820,822],{"className":199,"code":821,"language":201,"meta":202,"style":202},"kubectl apply -f investigator-agent.yaml\nkubectl apply -f diagnostician-agent.yaml\nkubectl apply -f notifier-agent.yaml\n\nkubectl get swarmagents -n incident-response\n",[49,823,824,835,846,857,863],{"__ignoreMap":202},[206,825,826,828,830,832],{"class":208,"line":209},[206,827,244],{"class":212},[206,829,247],{"class":216},[206,831,251],{"class":250},[206,833,834],{"class":216}," investigator-agent.yaml\n",[206,836,837,839,841,843],{"class":208,"line":257},[206,838,244],{"class":212},[206,840,247],{"class":216},[206,842,251],{"class":250},[206,844,845],{"class":216}," diagnostician-agent.yaml\n",[206,847,848,850,852,854],{"class":208,"line":269},[206,849,244],{"class":212},[206,851,247],{"class":216},[206,853,251],{"class":250},[206,855,856],{"class":216}," notifier-agent.yaml\n",[206,858,859],{"class":208,"line":361},[206,860,862],{"emptyLinePlaceholder":861},true,"\n",[206,864,865,867,869,872,874],{"class":208,"line":369},[206,866,244],{"class":212},[206,868,290],{"class":216},[206,870,871],{"class":216}," swarmagents",[206,873,296],{"class":250},[206,875,299],{"class":216},[197,877,880],{"className":878,"code":879,"language":305},[303],"NAME                      MODEL        REPLICAS   READY   AGE\nincident-investigator     qwen2.5:7b   1          1       5s\nincident-diagnostician    qwen2.5:7b   1          1       5s\nincident-notifier         qwen2.5:7b   1          1       5s\n",[49,881,879],{"__ignoreMap":202},[99,883,885],{"id":884},"the-pipeline","The pipeline",[27,887,888],{},"A SwarmTeam wires the three agents into a DAG. Diagnostician waits for investigator,\nnotifier waits for diagnostician.",[197,890,892],{"className":327,"code":891,"language":329,"meta":202,"style":202},"# incident-team.yaml\nspec:\n  roles:\n    - name: investigator\n      swarmAgent: incident-investigator\n    - name: diagnostician\n      swarmAgent: incident-diagnostician\n    - name: notifier\n      swarmAgent: incident-notifier\n  pipeline:\n    - role: investigator\n      inputs:\n        alert: \"{{ .input.alert }}\"\n    - role: diagnostician\n      dependsOn: [investigator]\n      inputs:\n        investigation: \"{{ .steps.investigator.output }}\"\n        alert: \"{{ .input.alert }}\"\n    - role: notifier\n      dependsOn: [diagnostician]\n      inputs:\n        investigation: \"{{ .steps.investigator.output }}\"\n        diagnosis: \"{{ .steps.diagnostician.output }}\"\n",[49,893,894,899,905,912,924,934,945,954,965,974,981,992,999,1009,1019,1033,1039,1049,1057,1067,1078,1084,1092],{"__ignoreMap":202},[206,895,896],{"class":208,"line":209},[206,897,898],{"class":336},"# incident-team.yaml\n",[206,900,901,903],{"class":208,"line":257},[206,902,343],{"class":342},[206,904,347],{"class":346},[206,906,907,910],{"class":208,"line":269},[206,908,909],{"class":342},"  roles",[206,911,347],{"class":346},[206,913,914,917,919,921],{"class":208,"line":361},[206,915,916],{"class":346},"    - ",[206,918,421],{"class":342},[206,920,355],{"class":346},[206,922,923],{"class":216},"investigator\n",[206,925,926,929,931],{"class":208,"line":369},[206,927,928],{"class":342},"      swarmAgent",[206,930,355],{"class":346},[206,932,933],{"class":216},"incident-investigator\n",[206,935,936,938,940,942],{"class":208,"line":381},[206,937,916],{"class":346},[206,939,421],{"class":342},[206,941,355],{"class":346},[206,943,944],{"class":216},"diagnostician\n",[206,946,947,949,951],{"class":208,"line":387},[206,948,928],{"class":342},[206,950,355],{"class":346},[206,952,953],{"class":216},"incident-diagnostician\n",[206,955,956,958,960,962],{"class":208,"line":393},[206,957,916],{"class":346},[206,959,421],{"class":342},[206,961,355],{"class":346},[206,963,964],{"class":216},"notifier\n",[206,966,967,969,971],{"class":208,"line":399},[206,968,928],{"class":342},[206,970,355],{"class":346},[206,972,973],{"class":216},"incident-notifier\n",[206,975,976,979],{"class":208,"line":407},[206,977,978],{"class":342},"  pipeline",[206,980,347],{"class":346},[206,982,983,985,988,990],{"class":208,"line":415},[206,984,916],{"class":346},[206,986,987],{"class":342},"role",[206,989,355],{"class":346},[206,991,923],{"class":216},[206,993,994,997],{"class":208,"line":429},[206,995,996],{"class":342},"      inputs",[206,998,347],{"class":346},[206,1000,1001,1004,1006],{"class":208,"line":440},[206,1002,1003],{"class":342},"        alert",[206,1005,355],{"class":346},[206,1007,1008],{"class":216},"\"{{ .input.alert }}\"\n",[206,1010,1011,1013,1015,1017],{"class":208,"line":452},[206,1012,916],{"class":346},[206,1014,987],{"class":342},[206,1016,355],{"class":346},[206,1018,944],{"class":216},[206,1020,1021,1024,1027,1030],{"class":208,"line":462},[206,1022,1023],{"class":342},"      dependsOn",[206,1025,1026],{"class":346},": [",[206,1028,1029],{"class":216},"investigator",[206,1031,1032],{"class":346},"]\n",[206,1034,1035,1037],{"class":208,"line":470},[206,1036,996],{"class":342},[206,1038,347],{"class":346},[206,1040,1041,1044,1046],{"class":208,"line":478},[206,1042,1043],{"class":342},"        investigation",[206,1045,355],{"class":346},[206,1047,1048],{"class":216},"\"{{ .steps.investigator.output }}\"\n",[206,1050,1051,1053,1055],{"class":208,"line":489},[206,1052,1003],{"class":342},[206,1054,355],{"class":346},[206,1056,1008],{"class":216},[206,1058,1059,1061,1063,1065],{"class":208,"line":500},[206,1060,916],{"class":346},[206,1062,987],{"class":342},[206,1064,355],{"class":346},[206,1066,964],{"class":216},[206,1068,1069,1071,1073,1076],{"class":208,"line":508},[206,1070,1023],{"class":342},[206,1072,1026],{"class":346},[206,1074,1075],{"class":216},"diagnostician",[206,1077,1032],{"class":346},[206,1079,1080,1082],{"class":208,"line":516},[206,1081,996],{"class":342},[206,1083,347],{"class":346},[206,1085,1086,1088,1090],{"class":208,"line":525},[206,1087,1043],{"class":342},[206,1089,355],{"class":346},[206,1091,1048],{"class":216},[206,1093,1095,1098,1100],{"class":208,"line":1094},23,[206,1096,1097],{"class":342},"        diagnosis",[206,1099,355],{"class":346},[206,1101,1102],{"class":216},"\"{{ .steps.diagnostician.output }}\"\n",[27,1104,1105],{},"The notifier gets both the investigation and the diagnosis as input, so it has\nfull context when writing the Slack message.",[197,1107,1109],{"className":199,"code":1108,"language":201,"meta":202,"style":202},"kubectl apply -f incident-team.yaml\n",[49,1110,1111],{"__ignoreMap":202},[206,1112,1113,1115,1117,1119],{"class":208,"line":209},[206,1114,244],{"class":212},[206,1116,247],{"class":216},[206,1118,251],{"class":250},[206,1120,1121],{"class":216}," incident-team.yaml\n",[99,1123,1125],{"id":1124},"running-it","Running it",[27,1127,1128],{},"I fed it a simulated PagerDuty alert - high error rate on the payments API:",[197,1130,1132],{"className":327,"code":1131,"language":329,"meta":202,"style":202},"# sample-incident.yaml\nspec:\n  teamRef: incident-responder\n  input:\n    alert: |\n      PagerDuty Incident P-48291: High error rate on payments-api\n\n      Severity: High\n      Service: payments-api\n      Triggered: 2026-04-23T02:45:00Z\n      Description: Error rate on payments-api exceeded 5% threshold.\n      Current rate: 12.3% 5xx responses over the last 5 minutes.\n      Alert count: 47 alerts in the last 8 minutes\n",[49,1133,1134,1139,1145,1155,1162,1171,1176,1180,1185,1190,1195,1200,1205],{"__ignoreMap":202},[206,1135,1136],{"class":208,"line":209},[206,1137,1138],{"class":336},"# sample-incident.yaml\n",[206,1140,1141,1143],{"class":208,"line":257},[206,1142,343],{"class":342},[206,1144,347],{"class":346},[206,1146,1147,1150,1152],{"class":208,"line":269},[206,1148,1149],{"class":342},"  teamRef",[206,1151,355],{"class":346},[206,1153,1154],{"class":216},"incident-responder\n",[206,1156,1157,1160],{"class":208,"line":361},[206,1158,1159],{"class":342},"  input",[206,1161,347],{"class":346},[206,1163,1164,1167,1169],{"class":208,"line":369},[206,1165,1166],{"class":342},"    alert",[206,1168,355],{"class":346},[206,1170,378],{"class":377},[206,1172,1173],{"class":208,"line":381},[206,1174,1175],{"class":216},"      PagerDuty Incident P-48291: High error rate on payments-api\n",[206,1177,1178],{"class":208,"line":387},[206,1179,862],{"emptyLinePlaceholder":861},[206,1181,1182],{"class":208,"line":393},[206,1183,1184],{"class":216},"      Severity: High\n",[206,1186,1187],{"class":208,"line":399},[206,1188,1189],{"class":216},"      Service: payments-api\n",[206,1191,1192],{"class":208,"line":407},[206,1193,1194],{"class":216},"      Triggered: 2026-04-23T02:45:00Z\n",[206,1196,1197],{"class":208,"line":415},[206,1198,1199],{"class":216},"      Description: Error rate on payments-api exceeded 5% threshold.\n",[206,1201,1202],{"class":208,"line":429},[206,1203,1204],{"class":216},"      Current rate: 12.3% 5xx responses over the last 5 minutes.\n",[206,1206,1207],{"class":208,"line":440},[206,1208,1209],{"class":216},"      Alert count: 47 alerts in the last 8 minutes\n",[197,1211,1213],{"className":199,"code":1212,"language":201,"meta":202,"style":202},"kubectl apply -f sample-incident.yaml\nkubectl get swarmrun incident-001 -n incident-response -w\n",[49,1214,1215,1226],{"__ignoreMap":202},[206,1216,1217,1219,1221,1223],{"class":208,"line":209},[206,1218,244],{"class":212},[206,1220,247],{"class":216},[206,1222,251],{"class":250},[206,1224,1225],{"class":216}," sample-incident.yaml\n",[206,1227,1228,1230,1232,1235,1238,1240,1243],{"class":208,"line":257},[206,1229,244],{"class":212},[206,1231,290],{"class":216},[206,1233,1234],{"class":216}," swarmrun",[206,1236,1237],{"class":216}," incident-001",[206,1239,296],{"class":250},[206,1241,1242],{"class":216}," incident-response",[206,1244,1245],{"class":250}," -w\n",[197,1247,1250],{"className":1248,"code":1249,"language":305},[303],"NAME           PHASE      AGE\nincident-001   Pending    0s\nincident-001   Running    2s\nincident-001   Succeeded  2m18s\n",[49,1251,1249],{"__ignoreMap":202},[27,1253,1254],{},"Here's what each agent actually produced.",[27,1256,1257,1259,1260,1263,1264,1267,1268,1271],{},[30,1258,138],{}," - it called all three MCP tools (",[49,1261,1262],{},"get_incident",", ",[49,1265,1266],{},"query_logs",",\n",[49,1269,1270],{},"query_metrics",") and pulled together this report:",[197,1273,1277],{"className":1274,"code":1275,"language":1276,"meta":202,"style":202},"language-json shiki shiki-themes github-dark","{\n  \"incident_id\": \"P-48291\",\n  \"service\": \"payments-api\",\n  \"error_pattern\": \"Connection timeout and pool exhaustion errors involving PostgreSQL primary server failure\",\n  \"timeline\": \"Incident started at 02:47 UTC when a postgres failover occurred, leading to an error rate spike from 0.1% to 12.3% by 02:50 UTC\",\n  \"metrics_summary\": \"Error rate on payments-api exceeded baseline starting at 02:47 UTC during a PostgreSQL failover event, peaking at 12.3%\"\n}\n","json",[49,1278,1279,1284,1296,1308,1320,1332,1342],{"__ignoreMap":202},[206,1280,1281],{"class":208,"line":209},[206,1282,1283],{"class":346},"{\n",[206,1285,1286,1289,1291,1294],{"class":208,"line":257},[206,1287,1288],{"class":250},"  \"incident_id\"",[206,1290,355],{"class":346},[206,1292,1293],{"class":216},"\"P-48291\"",[206,1295,1267],{"class":346},[206,1297,1298,1301,1303,1306],{"class":208,"line":269},[206,1299,1300],{"class":250},"  \"service\"",[206,1302,355],{"class":346},[206,1304,1305],{"class":216},"\"payments-api\"",[206,1307,1267],{"class":346},[206,1309,1310,1313,1315,1318],{"class":208,"line":361},[206,1311,1312],{"class":250},"  \"error_pattern\"",[206,1314,355],{"class":346},[206,1316,1317],{"class":216},"\"Connection timeout and pool exhaustion errors involving PostgreSQL primary server failure\"",[206,1319,1267],{"class":346},[206,1321,1322,1325,1327,1330],{"class":208,"line":369},[206,1323,1324],{"class":250},"  \"timeline\"",[206,1326,355],{"class":346},[206,1328,1329],{"class":216},"\"Incident started at 02:47 UTC when a postgres failover occurred, leading to an error rate spike from 0.1% to 12.3% by 02:50 UTC\"",[206,1331,1267],{"class":346},[206,1333,1334,1337,1339],{"class":208,"line":381},[206,1335,1336],{"class":250},"  \"metrics_summary\"",[206,1338,355],{"class":346},[206,1340,1341],{"class":216},"\"Error rate on payments-api exceeded baseline starting at 02:47 UTC during a PostgreSQL failover event, peaking at 12.3%\"\n",[206,1343,1344],{"class":208,"line":387},[206,1345,1346],{"class":346},"}\n",[27,1348,1349],{},"I checked the audit logs - every data point traces back to the mock Grafana\nresponse. No hallucination.",[27,1351,1352,1354],{},[30,1353,154],{}," - correctly identified the root cause:",[197,1356,1358],{"className":1274,"code":1357,"language":1276,"meta":202,"style":202},"{\n  \"root_cause\": \"PostgreSQL primary server failure leading to database connection issues and exhausted connection pool\",\n  \"severity\": \"high\",\n  \"blast_radius\": \"payments-api service experiencing high error rates (12.3%)\",\n  \"remediation\": \"Promote the standby PostgreSQL replica to primary and restart the payments-api deployment to clear connection pool caches\",\n  \"runbook\": \"\",\n  \"confidence\": \"high\"\n}\n",[49,1359,1360,1364,1376,1388,1400,1412,1424,1434],{"__ignoreMap":202},[206,1361,1362],{"class":208,"line":209},[206,1363,1283],{"class":346},[206,1365,1366,1369,1371,1374],{"class":208,"line":257},[206,1367,1368],{"class":250},"  \"root_cause\"",[206,1370,355],{"class":346},[206,1372,1373],{"class":216},"\"PostgreSQL primary server failure leading to database connection issues and exhausted connection pool\"",[206,1375,1267],{"class":346},[206,1377,1378,1381,1383,1386],{"class":208,"line":269},[206,1379,1380],{"class":250},"  \"severity\"",[206,1382,355],{"class":346},[206,1384,1385],{"class":216},"\"high\"",[206,1387,1267],{"class":346},[206,1389,1390,1393,1395,1398],{"class":208,"line":361},[206,1391,1392],{"class":250},"  \"blast_radius\"",[206,1394,355],{"class":346},[206,1396,1397],{"class":216},"\"payments-api service experiencing high error rates (12.3%)\"",[206,1399,1267],{"class":346},[206,1401,1402,1405,1407,1410],{"class":208,"line":369},[206,1403,1404],{"class":250},"  \"remediation\"",[206,1406,355],{"class":346},[206,1408,1409],{"class":216},"\"Promote the standby PostgreSQL replica to primary and restart the payments-api deployment to clear connection pool caches\"",[206,1411,1267],{"class":346},[206,1413,1414,1417,1419,1422],{"class":208,"line":381},[206,1415,1416],{"class":250},"  \"runbook\"",[206,1418,355],{"class":346},[206,1420,1421],{"class":216},"\"\"",[206,1423,1267],{"class":346},[206,1425,1426,1429,1431],{"class":208,"line":387},[206,1427,1428],{"class":250},"  \"confidence\"",[206,1430,355],{"class":346},[206,1432,1433],{"class":216},"\"high\"\n",[206,1435,1436],{"class":208,"line":393},[206,1437,1346],{"class":346},[27,1439,1440,1441,1444],{},"The empty ",[49,1442,1443],{},"runbook"," field is correct - none was provided in the evidence, and I\nexplicitly told the model not to invent URLs. (It tried to on an earlier run.\nSmall models love making up links.)",[27,1446,1447,1449],{},[30,1448,170],{}," - the mock Slack server logged this:",[197,1451,1453],{"className":199,"code":1452,"language":201,"meta":202,"style":202},"kubectl logs -n incident-response deploy\u002Fmock-slack\n",[49,1454,1455],{"__ignoreMap":202},[206,1456,1457,1459,1462,1464,1466],{"class":208,"line":209},[206,1458,244],{"class":212},[206,1460,1461],{"class":216}," logs",[206,1463,296],{"class":250},[206,1465,1242],{"class":216},[206,1467,1468],{"class":216}," deploy\u002Fmock-slack\n",[197,1470,1473],{"className":1471,"code":1472,"language":305},[303],"========================================\nSLACK MESSAGE to #incidents\n========================================\n*Incident Summary*\n**Service:** payments-api\n**Root Cause:** PostgreSQL primary server failure leading to\ndatabase connection issues and exhausted connection pool\n**Severity:** High\n**Blast Radius:** payments-api service experiencing high error rates (12.3%)\n*Remediation Steps:*\n- Promote the standby PostgreSQL replica to primary\n- Restart the payments-api deployment\n- Clear connection pool caches\n- Monitor for any persistent issues\n========================================\n",[49,1474,1472],{"__ignoreMap":202},[27,1476,1477],{},"And the PagerDuty mock logged the acknowledgment. Both tools called, both\nactions completed.",[99,1479,1481],{"id":1480},"policy","Policy",[27,1483,1484],{},"Once this works, you want to make sure nobody deploys a rogue agent with\nshell access in this namespace:",[197,1486,1488],{"className":327,"code":1487,"language":329,"meta":202,"style":202},"# incident-policy.yaml\nspec:\n  enforcementMode: Enforce\n  limits:\n    maxDailyTokens: 200000\n    maxTokensPerCall: 3000\n    maxTimeoutSeconds: 600\n  tools:\n    deny:\n      - \"shell\u002F*\"\n      - \"filesystem\u002F*\"\n  models:\n    allowed:\n      - \"qwen*\"\n      - \"llama*\"\n",[49,1489,1490,1495,1501,1511,1518,1528,1537,1547,1553,1560,1567,1574,1581,1588,1595],{"__ignoreMap":202},[206,1491,1492],{"class":208,"line":209},[206,1493,1494],{"class":336},"# incident-policy.yaml\n",[206,1496,1497,1499],{"class":208,"line":257},[206,1498,343],{"class":342},[206,1500,347],{"class":346},[206,1502,1503,1506,1508],{"class":208,"line":269},[206,1504,1505],{"class":342},"  enforcementMode",[206,1507,355],{"class":346},[206,1509,1510],{"class":216},"Enforce\n",[206,1512,1513,1516],{"class":208,"line":361},[206,1514,1515],{"class":342},"  limits",[206,1517,347],{"class":346},[206,1519,1520,1523,1525],{"class":208,"line":369},[206,1521,1522],{"class":342},"    maxDailyTokens",[206,1524,355],{"class":346},[206,1526,1527],{"class":250},"200000\n",[206,1529,1530,1533,1535],{"class":208,"line":381},[206,1531,1532],{"class":342},"    maxTokensPerCall",[206,1534,355],{"class":346},[206,1536,633],{"class":250},[206,1538,1539,1542,1544],{"class":208,"line":387},[206,1540,1541],{"class":342},"    maxTimeoutSeconds",[206,1543,355],{"class":346},[206,1545,1546],{"class":250},"600\n",[206,1548,1549,1551],{"class":208,"line":393},[206,1550,402],{"class":342},[206,1552,347],{"class":346},[206,1554,1555,1558],{"class":208,"line":399},[206,1556,1557],{"class":342},"    deny",[206,1559,347],{"class":346},[206,1561,1562,1564],{"class":208,"line":407},[206,1563,418],{"class":346},[206,1565,1566],{"class":216},"\"shell\u002F*\"\n",[206,1568,1569,1571],{"class":208,"line":415},[206,1570,418],{"class":346},[206,1572,1573],{"class":216},"\"filesystem\u002F*\"\n",[206,1575,1576,1579],{"class":208,"line":429},[206,1577,1578],{"class":342},"  models",[206,1580,347],{"class":346},[206,1582,1583,1586],{"class":208,"line":440},[206,1584,1585],{"class":342},"    allowed",[206,1587,347],{"class":346},[206,1589,1590,1592],{"class":208,"line":452},[206,1591,418],{"class":346},[206,1593,1594],{"class":216},"\"qwen*\"\n",[206,1596,1597,1599],{"class":208,"line":462},[206,1598,418],{"class":346},[206,1600,1601],{"class":216},"\"llama*\"\n",[107,1603,1604,1614],{},[110,1605,1606],{},[113,1607,1608,1611],{},[116,1609,1610],{},"Rule",[116,1612,1613],{},"Why",[129,1615,1616,1626,1636,1646],{},[113,1617,1618,1623],{},[134,1619,1620],{},[49,1621,1622],{},"tools.deny: shell\u002F*, filesystem\u002F*",[134,1624,1625],{},"Agents can use MCP tools but never execute commands or write files",[113,1627,1628,1633],{},[134,1629,1630],{},[49,1631,1632],{},"models.allowed: qwen*, llama*",[134,1634,1635],{},"Only approved local models - no surprise API bills",[113,1637,1638,1643],{},[134,1639,1640],{},[49,1641,1642],{},"maxDailyTokens: 200000",[134,1644,1645],{},"200K tokens\u002Fday ceiling - enough for ~20 incidents",[113,1647,1648,1653],{},[134,1649,1650],{},[49,1651,1652],{},"maxTokensPerCall: 3000",[134,1654,1655],{},"No single LLM call burns more than 3K tokens",[99,1657,1659],{"id":1658},"going-further","Going further",[27,1661,1662],{},"To connect real services, swap the mock URLs:",[197,1664,1666],{"className":327,"code":1665,"language":329,"meta":202,"style":202},"tools:\n  mcp:\n    - name: pagerduty\n      url: \"http:\u002F\u002Fpagerduty-mcp.incident-response.svc:8080\"\n      auth:\n        type: bearer\n        secretRef:\n          name: pagerduty-api-key\n",[49,1667,1668,1674,1681,1691,1701,1708,1718,1725],{"__ignoreMap":202},[206,1669,1670,1672],{"class":208,"line":209},[206,1671,648],{"class":342},[206,1673,347],{"class":346},[206,1675,1676,1679],{"class":208,"line":257},[206,1677,1678],{"class":342},"  mcp",[206,1680,347],{"class":346},[206,1682,1683,1685,1687,1689],{"class":208,"line":269},[206,1684,916],{"class":346},[206,1686,421],{"class":342},[206,1688,355],{"class":346},[206,1690,426],{"class":216},[206,1692,1693,1696,1698],{"class":208,"line":361},[206,1694,1695],{"class":342},"      url",[206,1697,355],{"class":346},[206,1699,1700],{"class":216},"\"http:\u002F\u002Fpagerduty-mcp.incident-response.svc:8080\"\n",[206,1702,1703,1706],{"class":208,"line":369},[206,1704,1705],{"class":342},"      auth",[206,1707,347],{"class":346},[206,1709,1710,1713,1715],{"class":208,"line":381},[206,1711,1712],{"class":342},"        type",[206,1714,355],{"class":346},[206,1716,1717],{"class":216},"bearer\n",[206,1719,1720,1723],{"class":208,"line":387},[206,1721,1722],{"class":342},"        secretRef",[206,1724,347],{"class":346},[206,1726,1727,1730,1732],{"class":208,"line":393},[206,1728,1729],{"class":342},"          name",[206,1731,355],{"class":346},[206,1733,1734],{"class":216},"pagerduty-api-key\n",[27,1736,1737,1738,1741],{},"To trigger automatically from PagerDuty webhooks instead of ",[49,1739,1740],{},"kubectl apply",",\nuse a SwarmEvent:",[197,1743,1745],{"className":327,"code":1744,"language":329,"meta":202,"style":202},"spec:\n  source:\n    type: webhook\n  targets:\n    - team: incident-responder\n      inputs:\n        alert: \"{{ .trigger.body.incident.title }}: {{ .trigger.body.incident.description }}\"\n  concurrencyPolicy: Allow\n",[49,1746,1747,1753,1760,1770,1777,1788,1794,1803],{"__ignoreMap":202},[206,1748,1749,1751],{"class":208,"line":209},[206,1750,343],{"class":342},[206,1752,347],{"class":346},[206,1754,1755,1758],{"class":208,"line":257},[206,1756,1757],{"class":342},"  source",[206,1759,347],{"class":346},[206,1761,1762,1765,1767],{"class":208,"line":269},[206,1763,1764],{"class":342},"    type",[206,1766,355],{"class":346},[206,1768,1769],{"class":216},"webhook\n",[206,1771,1772,1775],{"class":208,"line":361},[206,1773,1774],{"class":342},"  targets",[206,1776,347],{"class":346},[206,1778,1779,1781,1784,1786],{"class":208,"line":369},[206,1780,916],{"class":346},[206,1782,1783],{"class":342},"team",[206,1785,355],{"class":346},[206,1787,1154],{"class":216},[206,1789,1790,1792],{"class":208,"line":381},[206,1791,996],{"class":342},[206,1793,347],{"class":346},[206,1795,1796,1798,1800],{"class":208,"line":387},[206,1797,1003],{"class":342},[206,1799,355],{"class":346},[206,1801,1802],{"class":216},"\"{{ .trigger.body.incident.title }}: {{ .trigger.body.incident.description }}\"\n",[206,1804,1805,1808,1810],{"class":208,"line":393},[206,1806,1807],{"class":342},"  concurrencyPolicy",[206,1809,355],{"class":346},[206,1811,1812],{"class":216},"Allow\n",[27,1814,1815],{},"The operator generates a webhook URL. Point PagerDuty at it and every incident\nfires the pipeline.",[99,1817,1819],{"id":1818},"numbers","Numbers",[107,1821,1822,1832],{},[110,1823,1824],{},[113,1825,1826,1829],{},[116,1827,1828],{},"Metric",[116,1830,1831],{},"Value",[129,1833,1834,1844,1854,1864,1874,1884,1894],{},[113,1835,1836,1841],{},[134,1837,1838],{},[30,1839,1840],{},"Agents",[134,1842,1843],{},"3 (investigator + diagnostician + notifier)",[113,1845,1846,1851],{},[134,1847,1848],{},[30,1849,1850],{},"Model",[134,1852,1853],{},"qwen2.5:7b (any model works)",[113,1855,1856,1861],{},[134,1857,1858],{},[30,1859,1860],{},"Time per incident",[134,1862,1863],{},"~2.5 minutes",[113,1865,1866,1871],{},[134,1867,1868],{},[30,1869,1870],{},"Tokens per incident",[134,1872,1873],{},"~10,000",[113,1875,1876,1881],{},[134,1877,1878],{},[30,1879,1880],{},"Cost per incident",[134,1882,1883],{},"$0.00 (local model)",[113,1885,1886,1891],{},[134,1887,1888],{},[30,1889,1890],{},"Tools",[134,1892,1893],{},"PagerDuty, Grafana, Slack via MCP",[113,1895,1896,1901],{},[134,1897,1898],{},[30,1899,1900],{},"Guardrails",[134,1902,1903],{},"Per-agent tool allowlists, namespace policy, token budgets",[27,1905,1906],{},"The 2.5 minutes is mostly Ollama thinking on my laptop. With a faster model\nor GPU, this would be well under a minute.",[99,1908,1910],{"id":1909},"cleanup","Cleanup",[197,1912,1914],{"className":199,"code":1913,"language":201,"meta":202,"style":202},"kubectl delete namespace incident-response\n",[49,1915,1916],{"__ignoreMap":202},[206,1917,1918,1920,1923,1926],{"class":208,"line":209},[206,1919,244],{"class":212},[206,1921,1922],{"class":216}," delete",[206,1924,1925],{"class":216}," namespace",[206,1927,299],{"class":216},[27,1929,1930,1931,1935,1936,1941],{},"All the files are in the\n",[35,1932,1934],{"href":37,"rel":1933},[39],"cookbook",".\nThe ",[35,1937,1940],{"href":1938,"rel":1939},"https:\u002F\u002Fdocs.kubeswarm.io",[39],"docs"," have more on MCP integration and SwarmEvents.",[43,1943],{},[27,1945,1946],{},[1947,1948,1949,1950,1954,1955,1954,1960],"em",{},"kubeswarm is an open-source Kubernetes operator for managing AI agents.\n",[35,1951,1953],{"href":1938,"rel":1952},[39],"Docs"," |\n",[35,1956,1959],{"href":1957,"rel":1958},"https:\u002F\u002Fgithub.com\u002Fkubeswarm\u002Fkubeswarm-cookbook",[39],"Cookbook",[35,1961,1964],{"href":1962,"rel":1963},"https:\u002F\u002Fgithub.com\u002Fkubeswarm\u002Fkubeswarm",[39],"GitHub",[1966,1967,1968],"style",{},"html pre.shiki code .svObZ, html code.shiki .svObZ{--shiki-default:#B392F0}html pre.shiki code .sU2Wk, html code.shiki .sU2Wk{--shiki-default:#9ECBFF}html .default .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html .shiki span {color: var(--shiki-default);background: var(--shiki-default-bg);font-style: var(--shiki-default-font-style);font-weight: var(--shiki-default-font-weight);text-decoration: var(--shiki-default-text-decoration);}html pre.shiki code .sDLfK, html code.shiki .sDLfK{--shiki-default:#79B8FF}html pre.shiki code .sAwPA, html code.shiki .sAwPA{--shiki-default:#6A737D}html pre.shiki code .s4JwU, html code.shiki .s4JwU{--shiki-default:#85E89D}html pre.shiki code .s95oV, html code.shiki .s95oV{--shiki-default:#E1E4E8}html pre.shiki code .snl16, html code.shiki .snl16{--shiki-default:#F97583}",{"title":202,"searchDepth":257,"depth":257,"links":1970},[1971,1972,1973,1974,1975,1976,1977,1978,1979,1980],{"id":101,"depth":257,"text":102},{"id":185,"depth":257,"text":186},{"id":223,"depth":257,"text":224},{"id":318,"depth":257,"text":319},{"id":884,"depth":257,"text":885},{"id":1124,"depth":257,"text":1125},{"id":1480,"depth":257,"text":1481},{"id":1658,"depth":257,"text":1659},{"id":1818,"depth":257,"text":1819},{"id":1909,"depth":257,"text":1910},"tutorial","2026-04-25","Three AI agents investigate a PagerDuty alert, diagnose the root cause from Grafana logs, and post findings to Slack - all on a local cluster.","md",{},"\u002Fblog\u002Fincident-response",{"title":17,"description":1983},null,"blog\u002Fincident-response",[1991,1992,1993],"incident-response","multi-agent","mcp","kwhHJ9YejmpAWobp2aIReM6geYjPitZAV4Ihk4F_KK0",1777113023353]