commit 19872d51cb3e59965a8cb86c578d21ccd2b5b465 Author: Harry Reeder Date: Fri Jun 20 13:53:06 2025 +0100 Initial Commit diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..ad4a1f1 --- /dev/null +++ b/.gitignore @@ -0,0 +1,176 @@ +# Created by https://www.toptal.com/developers/gitignore/api/python +# Edit at https://www.toptal.com/developers/gitignore?templates=python + +### Python ### +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +cover/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +.pybuilder/ +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +# For a library or package, you might want to ignore these files since the code is +# intended to run in multiple environments; otherwise, check them in: +# .python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 
+# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# poetry +# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. +# This is especially recommended for binary packages to ensure reproducibility, and is more +# commonly ignored for libraries. +# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control +#poetry.lock + +# pdm +# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. +#pdm.lock +# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it +# in version control. +# https://pdm.fming.dev/#use-with-ide +.pdm.toml + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# pytype static type analyzer +.pytype/ + +# Cython debug symbols +cython_debug/ + +# PyCharm +# JetBrains specific template is maintained in a separate JetBrains.gitignore that can +# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore +# and can be added to the global gitignore or merged into this file. For a more nuclear +# option (not recommended) you can uncomment the following to ignore the entire idea folder. 
+#.idea/ + +### Python Patch ### +# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration +poetry.toml + +# ruff +.ruff_cache/ + +# LSP config files +pyrightconfig.json + +# End of https://www.toptal.com/developers/gitignore/api/python diff --git a/main.tf b/main.tf new file mode 100644 index 0000000..2e1f28b --- /dev/null +++ b/main.tf @@ -0,0 +1,102 @@ +terraform { + # We're using the value_wo features that exist in TF 1.11 or later + required_version = ">= 1.11.0" + + required_providers { + aws = { + source = "hashicorp/aws" + version = "~> 5.0" + } + } +} + +data "aws_caller_identity" "current" {} + +module "function" { + source = "terraform-aws-modules/lambda/aws" + version = "7.21.0" + + function_name = "feedsummarizer-${var.environment}" + description = "Slack RSS Feed Summarizer" + runtime = "python3.13" + handler = "summarizer.lambda_handler" + timeout = 300 # 5 minutes + + build_in_docker = true + architectures = ["arm64"] + + source_path = [{ + path = "${path.module}/src" + pip_requirements = true + dockerize_pip = true + }] + + environment_variables = { + BEDROCK_MODEL = var.bedrock_model + + PARAM_SLACK_WEBHOOK = aws_ssm_parameter.webhook_url.name + + POWERTOOLS_LOG_LEVEL = var.log_level + POWERTOOLS_SERVICE_NAME = "feedsummarizer" + } + + attach_policy_statements = true + policy_statements = { + AllowBedrock = { + effect = "Allow" + actions = ["bedrock:InvokeModel"] + resources = ["*"] # TODO: Lock down to just the selected model + } + + AllowSsmParameters = { + effect = "Allow" + actions = ["ssm:GetParameter"] + resources = [ + aws_ssm_parameter.webhook_url.arn, + ] + } + } +} + +resource "aws_ssm_parameter" "webhook_url" { + name = "/feedsummarizer/${var.environment}/slack-webhook-url" + type = "SecureString" + + # Seed value + value_wo = "not-set" + value_wo_version = 1 +} + +module "scheduler_role" { + source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role" + version = "5.58.0" + + 
trusted_role_services = ["scheduler.amazonaws.com"] + + create_role = true + role_requires_mfa = false # This is a service role, not for humans + role_name = "feedsummarizer-${var.environment}-scheduler" + + inline_policy_statements = [{ + sid = "AllowTriggerLambda" + actions = [ + "lambda:InvokeFunction" + ] + resources = [module.function.lambda_function_arn] + }] +} + +resource "aws_scheduler_schedule" "this" { + name = "feedsummarizer-${var.environment}" + schedule_expression = var.schedule_expression + schedule_expression_timezone = var.schedule_expression_timezone + + flexible_time_window { + mode = "OFF" + } + + target { + arn = module.function.lambda_function_arn + role_arn = module.scheduler_role.iam_role_arn + } +} diff --git a/src/models.py b/src/models.py new file mode 100644 index 0000000..6a1b686 --- /dev/null +++ b/src/models.py @@ -0,0 +1,34 @@ +from datetime import datetime + +from pydantic import BaseModel, Field + + +class MessageCategoryEntry(BaseModel): + headline: str = Field( + description="Blog-post headline/title/summary for this update. This can be up to two sentences long and should be reasonably descriptive." 
class MessageCategory(BaseModel):
    # Documented with comments rather than a class docstring on purpose:
    # pydantic copies a model docstring into the generated JSON schema, and
    # that schema is what the model is asked to fill in.
    emoji: str = Field(description="Slack-Coded Emoji for this category")
    title: str = Field(description="Title for this category of updates")
    entries: list[MessageCategoryEntry] = Field(
        description="Individual entries within this category"
    )


class SlackMessage(BaseModel):
    # Top-level structured output: one Slack post is sent per category.
    categories: list[MessageCategory]


class FeedItem(BaseModel):
    # A single RSS item, reduced to the fields the summarizer uses.
    title: str
    description: str
    # Publication date exactly as it appears in the feed, e.g.
    # "Fri, 20 Jun 2025 13:53:06 GMT".
    pub_date: str
    link: str

    @property
    def timestamp(self) -> int:
        """Return ``pub_date`` as whole seconds since the Unix epoch.

        The date is parsed as an RFC 2822 date, so both named zones ("GMT")
        and numeric offsets ("+0000") are understood and honoured. The
        previous ``strptime(..., "%Z")`` parse produced a *naive* datetime,
        which ``datetime.timestamp()`` interprets in the host's local zone,
        giving a skewed result on any machine not running in UTC.

        Raises:
            ValueError: if ``pub_date`` is not a parseable date.
        """
        # Local imports: this module only imports ``datetime`` at file level.
        from datetime import timezone
        from email.utils import parsedate_to_datetime

        parsed = parsedate_to_datetime(self.pub_date)
        if parsed.tzinfo is None:
            # No usable zone information in the feed value; treat it as UTC
            # rather than silently falling back to the host's local zone.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return int(parsed.timestamp())
# From: https://stackoverflow.com/a/925630
class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(); convert_charrefs=True
        # makes the parser decode entities (e.g. ``&amp;``) before the text
        # reaches handle_data().
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Collect one run of text found between tags."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far as a single string."""
        return self.text.getvalue()


def strip_tags(html: str) -> str:
    """Return ``html`` with all markup removed and entities decoded.

    Args:
        html: An HTML fragment or document.

    Returns:
        The concatenated text content of ``html``.
    """
    s = MLStripper()
    s.feed(html)
    # feed() holds back trailing text that contains an unterminated "&"
    # (it might be a character reference split across chunks). close() tells
    # the parser the input is complete so that text is flushed instead of
    # being silently dropped.
    s.close()
    return s.get_data()


def get_feed_entries(
    feed_url: str, since: int, *, timeout: float = 30.0
) -> "list[FeedItem]":
    """Download an RSS feed and return the items published after ``since``.

    Args:
        feed_url: URL of the RSS feed to download.
        since: Unix timestamp (seconds); only items whose ``timestamp`` is
            strictly greater than this are returned.
        timeout: Seconds to wait for the feed server before giving up.

    Returns:
        The matching items, with HTML stripped from each description and the
        description truncated to ``MAXLEN_DESCRIPTION`` characters. The list
        is empty when nothing in the feed is newer than ``since``.

    Raises:
        requests.RequestException: on connection failure, timeout, or a
            non-success HTTP status.
    """
    # Without a timeout a stalled server would block until the Lambda itself
    # is killed.
    response = requests.get(
        feed_url, headers={"User-Agent": USER_AGENT}, timeout=timeout
    )
    log.debug(
        "got feed",
        extra={
            "feed_url": feed_url,
            "response": {
                "status_code": response.status_code,
                "headers": dict(response.headers),
            },
        },
    )
    # Fail with a clear HTTP error instead of trying to parse an error page
    # as RSS.
    response.raise_for_status()
    feed = RSSParser.parse(response.text)

    items = [
        FeedItem(
            title=it.title.content,
            description=strip_tags(it.description.content)[:MAXLEN_DESCRIPTION],
            pub_date=it.pub_date.content,
            link=it.links[0].content,
        )
        for it in feed.channel.items
    ]
    items = [it for it in items if it.timestamp > since]

    # default=None: min() raises ValueError on an empty sequence, which used
    # to crash the run whenever the feed had nothing newer than ``since``.
    oldest = min(items, key=lambda it: it.timestamp, default=None)

    log.debug(
        "filtered items",
        extra={
            "item_count": len(items),
            "maxlen_description": MAXLEN_DESCRIPTION,
            "oldest": oldest.pub_date if oldest is not None else None,
        },
    )

    return items
def send_to_slack(message: SlackMessage):
    """Post ``message`` to the configured Slack webhook, one post per category.

    Each post is the category's emoji and bolded title followed by one
    bullet per entry with a "Learn More" link. A post that Slack rejects is
    logged at error level and the remaining categories are still attempted.
    """
    client = WebhookClient(SLACK_WEBHOOK_URL)

    for category in message.categories:
        slack_message = f"{category.emoji} *{category.title}*" + "".join(
            f"\n • {entry.headline} - <{entry.link}|Learn More>"
            for entry in category.entries
        )

        response = client.send(text=slack_message)
        if response.status_code != 200:
            # Previously the response was discarded, so a revoked or invalid
            # webhook failed without any trace in the logs.
            log.error(
                "slack webhook rejected message",
                extra={
                    "category": category.title,
                    "status_code": response.status_code,
                    "body": response.body,
                },
            )


def main():
    """Fetch the last week of feed items, summarise them and post to Slack.

    Raises:
        ValueError: if the webhook parameter still holds its "not-set" seed
            value.
    """
    if SLACK_WEBHOOK_URL == "not-set":
        raise ValueError(
            f"No Slack Webhook URL is currently set in SSM at {PARAM_SLACK_WEBHOOK}"
        )

    now = datetime.now()
    last_week = now - timedelta(weeks=1)
    since = int(last_week.timestamp())

    entries = get_feed_entries(FEED_URL, since=since)
    if not entries:
        # Nothing new: do not ask the model to summarise an empty prompt
        # (and then post whatever it invents).
        log.info("no new feed entries; skipping summary", extra={"since": since})
        return

    message = get_summary_message(entries)
    send_to_slack(message)


@log.inject_lambda_context
def lambda_handler(event: dict, context: LambdaContext):
    """AWS Lambda entry point; the event payload is not used."""
    main()


if __name__ == "__main__":
    main()