Initial Commit

2026-01-30 03:48:20 +00:00 · 2025-06-20 13:53:06 +01:00
commit 19872d51cb
6 changed files with 497 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,176 @@
+# Created by https://www.toptal.com/developers/gitignore/api/python
+# Edit at https://www.toptal.com/developers/gitignore?templates=python
+
+### Python ###
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
+
+### Python Patch ###
+# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
+poetry.toml
+
+# ruff
+.ruff_cache/
+
+# LSP config files
+pyrightconfig.json
+
+# End of https://www.toptal.com/developers/gitignore/api/python
--- a/main.tf
+++ b/main.tf
@@ -0,0 +1,102 @@
+terraform {
+  # We're using the write-only `value_wo` features that exist in TF 1.11 or later
+  required_version = ">= 1.11.0"
+
+  required_providers {
+    aws = {
+      source  = "hashicorp/aws"
+      version = "~> 5.0"
+    }
+  }
+}
+
+data "aws_caller_identity" "current" {}
+
+# Lambda function that runs the summarizer: packages ./src and uses
+# summarizer.lambda_handler as its entry point.
+module "function" {
+  source  = "terraform-aws-modules/lambda/aws"
+  version = "7.21.0"
+
+  function_name = "feedsummarizer-${var.environment}"
+  description   = "Slack RSS Feed Summarizer"
+  runtime       = "python3.13"
+  handler       = "summarizer.lambda_handler"
+  timeout       = 300 # 5 minutes
+
+  # Build the package (including pip requirements) inside Docker; presumably so
+  # that compiled dependencies match the arm64 Lambda runtime - TODO confirm.
+  build_in_docker = true
+  architectures   = ["arm64"]
+
+  source_path = [{
+    path             = "${path.module}/src"
+    pip_requirements = true
+    dockerize_pip    = true
+  }]
+
+  # Read by src/summarizer.py when the module is imported; BEDROCK_MODEL and
+  # PARAM_SLACK_WEBHOOK are looked up there without a default.
+  environment_variables = {
+    BEDROCK_MODEL = var.bedrock_model
+
+    # Only the parameter *name* is passed in; the function fetches and decrypts
+    # the value itself from SSM.
+    PARAM_SLACK_WEBHOOK = aws_ssm_parameter.webhook_url.name
+
+    POWERTOOLS_LOG_LEVEL    = var.log_level
+    POWERTOOLS_SERVICE_NAME = "feedsummarizer"
+  }
+
+  # IAM permissions the function needs: invoking Bedrock and reading the
+  # webhook parameter.
+  attach_policy_statements = true
+  policy_statements = {
+    AllowBedrock = {
+      effect    = "Allow"
+      actions   = ["bedrock:InvokeModel"]
+      resources = ["*"] # TODO: Lock down to just the selected model
+    }
+
+    # Scoped to the single webhook parameter defined in this configuration.
+    AllowSsmParameters = {
+      effect  = "Allow"
+      actions = ["ssm:GetParameter"]
+      resources = [
+        aws_ssm_parameter.webhook_url.arn,
+      ]
+    }
+  }
+}
+
+# SecureString parameter that holds the Slack incoming-webhook URL.
+resource "aws_ssm_parameter" "webhook_url" {
+  name = "/feedsummarizer/${var.environment}/slack-webhook-url"
+  type = "SecureString"
+
+  # Seed value: a placeholder written through the write-only (value_wo) argument.
+  # src/summarizer.py refuses to run while the value is still "not-set", so the
+  # real URL is presumably set out-of-band after the first apply - TODO confirm.
+  value_wo         = "not-set"
+  value_wo_version = 1
+}
+
+# IAM role assumed by EventBridge Scheduler (scheduler.amazonaws.com); its only
+# permission is to invoke the summarizer Lambda function.
+module "scheduler_role" {
+  source  = "terraform-aws-modules/iam/aws//modules/iam-assumable-role"
+  version = "5.58.0"
+
+  trusted_role_services = ["scheduler.amazonaws.com"]
+
+  create_role       = true
+  role_requires_mfa = false # This is a service role, not for humans
+  role_name         = "feedsummarizer-${var.environment}-scheduler"
+
+  # Limited to the one function created by module.function.
+  inline_policy_statements = [{
+    sid = "AllowTriggerLambda"
+    actions = [
+      "lambda:InvokeFunction"
+    ]
+    resources = [module.function.lambda_function_arn]
+  }]
+}
+
+# Schedule that triggers the summarizer Lambda according to
+# var.schedule_expression, evaluated in var.schedule_expression_timezone.
+resource "aws_scheduler_schedule" "this" {
+  name                         = "feedsummarizer-${var.environment}"
+  schedule_expression          = var.schedule_expression
+  schedule_expression_timezone = var.schedule_expression_timezone
+
+  # No flexible window is configured for the invocation time.
+  flexible_time_window {
+    mode = "OFF"
+  }
+
+  # Invoke the function using the dedicated scheduler role.
+  target {
+    arn      = module.function.lambda_function_arn
+    role_arn = module.scheduler_role.iam_role_arn
+  }
+}
--- a/src/models.py
+++ b/src/models.py
@@ -0,0 +1,34 @@
+from datetime import datetime
+
+from pydantic import BaseModel, Field
+
+
+# One bullet within a category of the Slack digest: a headline plus the link
+# to its source.
+# NOTE(review): documented with `#` comments rather than a docstring because a
+# pydantic class docstring presumably ends up in the generated schema - confirm
+# before converting.
+class MessageCategoryEntry(BaseModel):
+    headline: str = Field(
+        description="Blog-post headline/title/summary for this update. This can be up to two sentences long and should be reasonably descriptive."
+    )
+    link: str = Field(description="Link to the source for this update")
+
+
+# A titled group of related updates, with an emoji used as its visual marker.
+# NOTE(review): documented with `#` comments rather than a docstring because a
+# pydantic class docstring presumably ends up in the generated schema - confirm
+# before converting.
+class MessageCategory(BaseModel):
+    emoji: str = Field(description="Slack-Coded Emoji for this category")
+    title: str = Field(description="Title for this category of updates")
+    entries: list[MessageCategoryEntry] = Field(
+        description="Individual entries within this category"
+    )
+
+
+# Top-level structure of the digest: a list of categories of updates.
+# NOTE(review): documented with `#` comments rather than a docstring because a
+# pydantic class docstring presumably ends up in the generated schema - confirm
+# before converting.
+class SlackMessage(BaseModel):
+    categories: list[MessageCategory]
+
+
+class FeedItem(BaseModel):
+    title: str
+    description: str
+    pub_date: str
+    link: str
+
+    @property
+    def timestamp(self):
+        parsed = datetime.strptime(self.pub_date, "%a, %d %b %Y %H:%M:%S %Z")
+        return int(parsed.timestamp())
--- a/src/requirements.txt
+++ b/src/requirements.txt
@@ -0,0 +1,7 @@
+aws_lambda_powertools~=3.14
+boto3~=1.38
+langchain[aws]~=0.3.25
+pydantic~=2.11
+requests~=2.32
+rss-parser~=2.1
+slack_sdk~=3.35
--- a/src/summarizer.py
+++ b/src/summarizer.py
@@ -0,0 +1,149 @@
+import os
+from datetime import datetime, timedelta
+from html.parser import HTMLParser
+from io import StringIO
+
+import requests
+from aws_lambda_powertools import Logger
+from aws_lambda_powertools.utilities import parameters
+from aws_lambda_powertools.utilities.typing import LambdaContext
+from langchain_core.messages import SystemMessage, HumanMessage
+from langchain_aws import ChatBedrockConverse
+from rss_parser import RSSParser
+from slack_sdk.webhook import WebhookClient
+
+from models import FeedItem, SlackMessage
+
+# Configurable
+MODEL = os.environ["BEDROCK_MODEL"]
+MAXLEN_DESCRIPTION = int(os.environ.get("MAXLEN_DESCRIPTION", "2500"))
+PARAM_SLACK_WEBHOOK = os.environ["PARAM_SLACK_WEBHOOK"]
+USER_AGENT = os.environ.get("USER_AGENT", "feedsummarizer 0.1 +harry@reeder.dev")
+
+# TODO: Make FEED_URL and SYSTEM_PROMPT configurable
+FEED_URL = "https://aws.amazon.com/about-aws/whats-new/recent/feed/"
+SYSTEM_PROMPT = """Using the following entries from the AWS "What's New" feed, create a once-per-week message to post to slack to keep people up to date with what they should know.
+
+Ensure that anything which is in preview is clearly marked as such.
+
+Categories should have relevant emoji, and individual entries should have a "Learn More" link to the article's page.
+"""
+
+# Shared Bedrock chat client, created once when the module is imported.
+LLM = ChatBedrockConverse(
+    model=MODEL,
+    temperature=0.5,
+    max_tokens=None,
+)
+
+# Fetched (decrypted) from SSM at import time, so a changed parameter value is
+# only picked up when the module is loaded again.
+SLACK_WEBHOOK_URL = parameters.get_parameter(PARAM_SLACK_WEBHOOK, decrypt=True)
+
+# Module-wide Powertools logger used by every function below.
+log = Logger()
+
+
+# From: https://stackoverflow.com/a/925630
+class MLStripper(HTMLParser):
+    """HTML parser that discards markup and accumulates only the text content."""
+
+    def __init__(self):
+        super().__init__()
+        self.reset()
+        self.strict = False
+        # Have the parser decode character references (e.g. "&amp;") in text.
+        self.convert_charrefs = True
+        # Buffer that collects every text fragment passed to handle_data().
+        self.text = StringIO()
+
+    def handle_data(self, d):
+        """Append a fragment of text found between tags to the buffer."""
+        self.text.write(d)
+
+    def get_data(self):
+        """Return all text collected so far as a single string."""
+        return self.text.getvalue()
+
+
+def strip_tags(html: str) -> str:
+    s = MLStripper()
+    s.feed(html)
+    return s.get_data()
+
+
+def get_feed_entries(feed_url: str, since: int) -> list[FeedItem]:
+    response = requests.get(feed_url, headers={"User-Agent": USER_AGENT})
+    response_text = response.text
+    log.debug(
+        "got feed",
+        extra={
+            "feed_url": feed_url,
+            "response": {
+                "status_code": response.status_code,
+                "headers": dict(response.headers),
+            },
+        },
+    )
+    feed = RSSParser.parse(response_text)
+
+    items = [
+        FeedItem(
+            title=it.title.content,
+            description=strip_tags(it.description.content)[:MAXLEN_DESCRIPTION],
+            pub_date=it.pub_date.content,
+            link=it.links[0].content,
+        )
+        for it in feed.channel.items
+    ]
+
+    items = [it for it in items if it.timestamp > since]
+    oldest = min(items, key=lambda it: it.timestamp)
+
+    log.debug(
+        "filtered items",
+        extra={
+            "item_count": len(items),
+            "maxlen_description": MAXLEN_DESCRIPTION,
+            "oldest": oldest.pub_date,
+        },
+    )
+
+    return items
+
+
+def get_summary_message(feed_items: list[FeedItem]) -> SlackMessage:
+    messages = [
+        SystemMessage(SYSTEM_PROMPT),
+        HumanMessage("\n".join([it.model_dump_json() for it in feed_items])),
+    ]
+    model = LLM.with_structured_output(SlackMessage)
+    log.debug("invoking model", extra={"model": MODEL})
+    response = model.invoke(messages)
+    log.debug("got response")
+
+    return response
+
+
+def send_to_slack(message: SlackMessage):
+    client = WebhookClient(SLACK_WEBHOOK_URL)
+
+    for category in message.categories:
+        slack_message = f"{category.emoji} *{category.title}*"
+        for entry in category.entries:
+            slack_message += f"\n • {entry.headline} - <{entry.link}|Learn More>"
+
+        client.send(text=slack_message)
+
+
+def main():
+    if SLACK_WEBHOOK_URL == "not-set":
+        raise ValueError(
+            f"No Slack Webhook URL is currently set in SSM at {os.environ['PARAM_SLACK_WEBHOOK']}"
+        )
+
+    now = datetime.now()
+    last_week = now - timedelta(weeks=1)
+
+    entries = get_feed_entries(FEED_URL, since=int(last_week.timestamp()))
+    message = get_summary_message(entries)
+    send_to_slack(message)
+
+
+@log.inject_lambda_context
+def lambda_handler(event: dict, context: LambdaContext):
+    """Lambda entry point: run main(); ``event`` and ``context`` are not used here."""
+    main()
+
+
+if __name__ == "__main__":
+    main()
--- a/variables.tf
+++ b/variables.tf
@@ -0,0 +1,29 @@
+variable "environment" {
+  type        = string
+  default     = "prod"
+  description = "Environment name to suffix to resources"
+}
+
+variable "log_level" {
+  type        = string
+  default     = "INFO"
+  description = "Lambda Log Level"
+}
+
+variable "bedrock_model" {
+  default     = "eu.anthropic.claude-sonnet-4-20250514-v1:0"
+  type        = string
+  description = "Which bedrock model ID to use"
+}
+
+variable "schedule_expression" {
+  type        = string
+  description = "AWS EventBridge Scheduler Expression for when to run"
+  default     = "cron(30 8 ? * FRI *)"
+}
+
+variable "schedule_expression_timezone" {
+  type        = string
+  description = "Timezone for the schedule expression"
+  default     = "UTC"
+}