mirror of
https://github.com/createfuture-cloud/feedsummarizer.git
synced 2025-12-13 09:02:24 +00:00
Initial Commit
This commit is contained in:
176
.gitignore
vendored
Normal file
176
.gitignore
vendored
Normal file
@@ -0,0 +1,176 @@
|
||||
# Created by https://www.toptal.com/developers/gitignore/api/python
|
||||
# Edit at https://www.toptal.com/developers/gitignore?templates=python
|
||||
|
||||
### Python ###
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*$py.class
|
||||
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
develop-eggs/
|
||||
dist/
|
||||
downloads/
|
||||
eggs/
|
||||
.eggs/
|
||||
lib/
|
||||
lib64/
|
||||
parts/
|
||||
sdist/
|
||||
var/
|
||||
wheels/
|
||||
share/python-wheels/
|
||||
*.egg-info/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
|
||||
# PyInstaller
|
||||
# Usually these files are written by a python script from a template
|
||||
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
||||
*.manifest
|
||||
*.spec
|
||||
|
||||
# Installer logs
|
||||
pip-log.txt
|
||||
pip-delete-this-directory.txt
|
||||
|
||||
# Unit test / coverage reports
|
||||
htmlcov/
|
||||
.tox/
|
||||
.nox/
|
||||
.coverage
|
||||
.coverage.*
|
||||
.cache
|
||||
nosetests.xml
|
||||
coverage.xml
|
||||
*.cover
|
||||
*.py,cover
|
||||
.hypothesis/
|
||||
.pytest_cache/
|
||||
cover/
|
||||
|
||||
# Translations
|
||||
*.mo
|
||||
*.pot
|
||||
|
||||
# Django stuff:
|
||||
*.log
|
||||
local_settings.py
|
||||
db.sqlite3
|
||||
db.sqlite3-journal
|
||||
|
||||
# Flask stuff:
|
||||
instance/
|
||||
.webassets-cache
|
||||
|
||||
# Scrapy stuff:
|
||||
.scrapy
|
||||
|
||||
# Sphinx documentation
|
||||
docs/_build/
|
||||
|
||||
# PyBuilder
|
||||
.pybuilder/
|
||||
target/
|
||||
|
||||
# Jupyter Notebook
|
||||
.ipynb_checkpoints
|
||||
|
||||
# IPython
|
||||
profile_default/
|
||||
ipython_config.py
|
||||
|
||||
# pyenv
|
||||
# For a library or package, you might want to ignore these files since the code is
|
||||
# intended to run in multiple environments; otherwise, check them in:
|
||||
# .python-version
|
||||
|
||||
# pipenv
|
||||
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
||||
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
||||
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
||||
# install all needed dependencies.
|
||||
#Pipfile.lock
|
||||
|
||||
# poetry
|
||||
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
||||
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
||||
# commonly ignored for libraries.
|
||||
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
||||
#poetry.lock
|
||||
|
||||
# pdm
|
||||
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
||||
#pdm.lock
|
||||
# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
|
||||
# in version control.
|
||||
# https://pdm.fming.dev/#use-with-ide
|
||||
.pdm.toml
|
||||
|
||||
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
||||
__pypackages__/
|
||||
|
||||
# Celery stuff
|
||||
celerybeat-schedule
|
||||
celerybeat.pid
|
||||
|
||||
# SageMath parsed files
|
||||
*.sage.py
|
||||
|
||||
# Environments
|
||||
.env
|
||||
.venv
|
||||
env/
|
||||
venv/
|
||||
ENV/
|
||||
env.bak/
|
||||
venv.bak/
|
||||
|
||||
# Spyder project settings
|
||||
.spyderproject
|
||||
.spyproject
|
||||
|
||||
# Rope project settings
|
||||
.ropeproject
|
||||
|
||||
# mkdocs documentation
|
||||
/site
|
||||
|
||||
# mypy
|
||||
.mypy_cache/
|
||||
.dmypy.json
|
||||
dmypy.json
|
||||
|
||||
# Pyre type checker
|
||||
.pyre/
|
||||
|
||||
# pytype static type analyzer
|
||||
.pytype/
|
||||
|
||||
# Cython debug symbols
|
||||
cython_debug/
|
||||
|
||||
# PyCharm
|
||||
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
||||
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
||||
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
||||
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
||||
#.idea/
|
||||
|
||||
### Python Patch ###
|
||||
# Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration
|
||||
poetry.toml
|
||||
|
||||
# ruff
|
||||
.ruff_cache/
|
||||
|
||||
# LSP config files
|
||||
pyrightconfig.json
|
||||
|
||||
# End of https://www.toptal.com/developers/gitignore/api/python
|
||||
102
main.tf
Normal file
102
main.tf
Normal file
@@ -0,0 +1,102 @@
|
||||
terraform {
|
||||
# We're using the value_wo features that exist in TF 1.11 or later
|
||||
required_version = ">= 1.11.0"
|
||||
|
||||
required_providers {
|
||||
aws = {
|
||||
source = "hashicorp/aws"
|
||||
version = "~> 5.0"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
data "aws_caller_identity" "current" {}
|
||||
|
||||
module "function" {
|
||||
source = "terraform-aws-modules/lambda/aws"
|
||||
version = "7.21.0"
|
||||
|
||||
function_name = "feedsummarizer-${var.environment}"
|
||||
description = "Slack RSS Feed Summarizer"
|
||||
runtime = "python3.13"
|
||||
handler = "summarizer.lambda_handler"
|
||||
timeout = 300 # 5 minutes
|
||||
|
||||
build_in_docker = true
|
||||
architectures = ["arm64"]
|
||||
|
||||
source_path = [{
|
||||
path = "${path.module}/src"
|
||||
pip_requirements = true
|
||||
dockerize_pip = true
|
||||
}]
|
||||
|
||||
environment_variables = {
|
||||
BEDROCK_MODEL = var.bedrock_model
|
||||
|
||||
PARAM_SLACK_WEBHOOK = aws_ssm_parameter.webhook_url.name
|
||||
|
||||
POWERTOOLS_LOG_LEVEL = var.log_level
|
||||
POWERTOOLS_SERVICE_NAME = "feedsummarizer"
|
||||
}
|
||||
|
||||
attach_policy_statements = true
|
||||
policy_statements = {
|
||||
AllowBedrock = {
|
||||
effect = "Allow"
|
||||
actions = ["bedrock:InvokeModel"]
|
||||
resources = ["*"] # TODO: Lock down to just the selected model
|
||||
}
|
||||
|
||||
AllowSsmParameters = {
|
||||
effect = "Allow"
|
||||
actions = ["ssm:GetParameter"]
|
||||
resources = [
|
||||
aws_ssm_parameter.webhook_url.arn,
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
resource "aws_ssm_parameter" "webhook_url" {
|
||||
name = "/feedsummarizer/${var.environment}/slack-webhook-url"
|
||||
type = "SecureString"
|
||||
|
||||
# Seed value
|
||||
value_wo = "not-set"
|
||||
value_wo_version = 1
|
||||
}
|
||||
|
||||
module "scheduler_role" {
|
||||
source = "terraform-aws-modules/iam/aws//modules/iam-assumable-role"
|
||||
version = "5.58.0"
|
||||
|
||||
trusted_role_services = ["scheduler.amazonaws.com"]
|
||||
|
||||
create_role = true
|
||||
role_requires_mfa = false # This is a service role, not for humans
|
||||
role_name = "feedsummarizer-${var.environment}-scheduler"
|
||||
|
||||
inline_policy_statements = [{
|
||||
sid = "AllowTriggerLambda"
|
||||
actions = [
|
||||
"lambda:InvokeFunction"
|
||||
]
|
||||
resources = [module.function.lambda_function_arn]
|
||||
}]
|
||||
}
|
||||
|
||||
resource "aws_scheduler_schedule" "this" {
|
||||
name = "feedsummarizer-${var.environment}"
|
||||
schedule_expression = var.schedule_expression
|
||||
schedule_expression_timezone = var.schedule_expression_timezone
|
||||
|
||||
flexible_time_window {
|
||||
mode = "OFF"
|
||||
}
|
||||
|
||||
target {
|
||||
arn = module.function.lambda_function_arn
|
||||
role_arn = module.scheduler_role.iam_role_arn
|
||||
}
|
||||
}
|
||||
34
src/models.py
Normal file
34
src/models.py
Normal file
@@ -0,0 +1,34 @@
|
||||
from datetime import datetime
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
|
||||
class MessageCategoryEntry(BaseModel):
    # One update listed under a category: a short headline plus its source link.
    # Documented with comments (not a docstring) so the schema pydantic derives
    # from this class stays exactly as it was; the Field descriptions below are
    # runtime text and are kept verbatim.
    headline: str = Field(
        description=(
            "Blog-post headline/title/summary for this update. "
            "This can be up to two sentences long and should be "
            "reasonably descriptive."
        )
    )
    link: str = Field(
        description="Link to the source for this update",
    )
|
||||
|
||||
|
||||
class MessageCategory(BaseModel):
    # A titled group of related updates, decorated with an emoji.
    # Documented with comments (not a docstring) so the schema pydantic derives
    # from this class stays exactly as it was.
    emoji: str = Field(
        description="Slack-Coded Emoji for this category",
    )
    title: str = Field(
        description="Title for this category of updates",
    )
    entries: list[MessageCategoryEntry] = Field(
        description="Individual entries within this category",
    )
|
||||
|
||||
|
||||
class SlackMessage(BaseModel):
    # Top-level container: the full set of categories making up one summary.
    # Documented with comments (not a docstring) so the schema pydantic derives
    # from this class stays exactly as it was.
    categories: list[MessageCategory]
|
||||
|
||||
|
||||
class FeedItem(BaseModel):
    # A single feed entry reduced to the fields this project uses.
    title: str
    description: str
    # Publication date as text in RSS (RFC 822 style) format; parsed on demand
    # by the ``timestamp`` property.
    pub_date: str
    link: str

    @property
    def timestamp(self) -> int:
        """Return ``pub_date`` as whole seconds since the Unix epoch.

        The date is parsed as an RFC 822/2822 date, so both named zones
        ("GMT") and numeric offsets ("+0000") are honoured. Parsing with
        ``strptime`` and ``%Z`` produced a naive datetime that was then
        interpreted in the host's local time zone, giving shifted results on
        any machine not running in UTC.

        Raises:
            ValueError: if ``pub_date`` is not a parseable date.
        """
        # Imported locally so this block does not depend on module-level
        # imports beyond the ones the file already has.
        from datetime import timezone
        from email.utils import parsedate_to_datetime

        parsed = parsedate_to_datetime(self.pub_date)
        if parsed.tzinfo is None:
            # A "-0000" zone yields a naive datetime; treat it as UTC rather
            # than letting .timestamp() assume local time.
            parsed = parsed.replace(tzinfo=timezone.utc)
        return int(parsed.timestamp())
|
||||
7
src/requirements.txt
Normal file
7
src/requirements.txt
Normal file
@@ -0,0 +1,7 @@
|
||||
aws_lambda_powertools~=3.14
|
||||
boto3~=1.38
|
||||
langchain[aws]~=0.3.25
|
||||
pydantic~=2.11
|
||||
requests~=2.32
|
||||
rss-parser~=2.1
|
||||
slack_sdk~=3.35
|
||||
149
src/summarizer.py
Normal file
149
src/summarizer.py
Normal file
@@ -0,0 +1,149 @@
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from html.parser import HTMLParser
|
||||
from io import StringIO
|
||||
|
||||
import requests
|
||||
from aws_lambda_powertools import Logger
|
||||
from aws_lambda_powertools.utilities import parameters
|
||||
from aws_lambda_powertools.utilities.typing import LambdaContext
|
||||
from langchain_core.messages import SystemMessage, HumanMessage
|
||||
from langchain_aws import ChatBedrockConverse
|
||||
from rss_parser import RSSParser
|
||||
from slack_sdk.webhook import WebhookClient
|
||||
|
||||
from models import FeedItem, SlackMessage
|
||||
|
||||
# Configurable
|
||||
MODEL = os.environ["BEDROCK_MODEL"]
|
||||
MAXLEN_DESCRIPTION = int(os.environ.get("MAXLEN_DESCRIPTION", "2500"))
|
||||
PARAM_SLACK_WEBHOOK = os.environ["PARAM_SLACK_WEBHOOK"]
|
||||
USER_AGENT = os.environ.get("USER_AGENT", "feedsummarizer 0.1 +harry@reeder.dev")
|
||||
|
||||
# TODO: Make FEED_URL and SYSTEM_PROMPT configurable
|
||||
FEED_URL = "https://aws.amazon.com/about-aws/whats-new/recent/feed/"
|
||||
SYSTEM_PROMPT = """Using the following entries from the AWS "What's New" feed, create a once-per-week message to post to slack to keep people up to date with what they should know.
|
||||
|
||||
Ensure that anything which is in preview is clearly marked as such.
|
||||
|
||||
Categories should have relevant emoji, and individual entries should have a "Learn More" link to the article's page.
|
||||
"""
|
||||
|
||||
LLM = ChatBedrockConverse(
|
||||
model=MODEL,
|
||||
temperature=0.5,
|
||||
max_tokens=None,
|
||||
)
|
||||
|
||||
SLACK_WEBHOOK_URL = parameters.get_parameter(PARAM_SLACK_WEBHOOK, decrypt=True)
|
||||
|
||||
log = Logger()
|
||||
|
||||
|
||||
# From: https://stackoverflow.com/a/925630
|
||||
class MLStripper(HTMLParser):
    """HTML parser that discards markup and accumulates only the text content."""

    def __init__(self):
        # HTMLParser.__init__ already calls reset(), so no second reset() is
        # needed. convert_charrefs=True makes the parser decode character
        # references (e.g. "&amp;") before passing text to handle_data().
        super().__init__(convert_charrefs=True)
        self.text = StringIO()

    def handle_data(self, d):
        """Append a run of text found outside of tags to the buffer."""
        self.text.write(d)

    def get_data(self):
        """Return all text collected so far."""
        return self.text.getvalue()


def strip_tags(html: str) -> str:
    """Return *html* with all markup removed and character references decoded.

    Args:
        html: An HTML fragment or document.

    Returns:
        The concatenated text content of *html*.
    """
    s = MLStripper()
    s.feed(html)
    # feed() may hold back trailing text (for example text ending in a bare
    # "&", as in "R&D") in case more input follows; close() flushes it.
    # Without this call such input was silently dropped.
    s.close()
    return s.get_data()
|
||||
|
||||
|
||||
def get_feed_entries(
    feed_url: str, since: int, *, timeout: float = 30.0
) -> list[FeedItem]:
    """Download an RSS feed and return the items published after *since*.

    Args:
        feed_url: URL of the RSS feed to fetch.
        since: Unix timestamp (seconds); only items whose ``timestamp`` is
            strictly greater than this are returned.
        timeout: Seconds to wait for the HTTP request before giving up, so a
            stalled server cannot hang the caller indefinitely.

    Returns:
        The matching items, each with its description stripped of HTML and
        truncated to ``MAXLEN_DESCRIPTION`` characters. The list is empty when
        nothing in the feed is newer than *since*.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an HTTP
            error status from the feed server.
    """
    response = requests.get(
        feed_url,
        headers={"User-Agent": USER_AGENT},
        timeout=timeout,
    )
    log.debug(
        "got feed",
        extra={
            "feed_url": feed_url,
            "response": {
                "status_code": response.status_code,
                "headers": dict(response.headers),
            },
        },
    )
    # Fail here with a clear HTTP error instead of handing an error page to
    # the RSS parser. Done after logging so the status is recorded first.
    response.raise_for_status()

    feed = RSSParser.parse(response.text)

    items = [
        FeedItem(
            title=it.title.content,
            description=strip_tags(it.description.content)[:MAXLEN_DESCRIPTION],
            pub_date=it.pub_date.content,
            link=it.links[0].content,
        )
        for it in feed.channel.items
    ]
    items = [it for it in items if it.timestamp > since]

    # default=None: min() raises ValueError on an empty sequence, which used
    # to crash the run whenever the feed had nothing newer than *since*.
    oldest = min(items, key=lambda it: it.timestamp, default=None)

    log.debug(
        "filtered items",
        extra={
            "item_count": len(items),
            "maxlen_description": MAXLEN_DESCRIPTION,
            "oldest": oldest.pub_date if oldest is not None else None,
        },
    )

    return items
|
||||
|
||||
|
||||
def get_summary_message(feed_items: list[FeedItem]) -> SlackMessage:
    """Ask the configured LLM to summarize *feed_items* as a ``SlackMessage``.

    Each item is serialized to JSON, one per line, and sent as the human
    message alongside ``SYSTEM_PROMPT``; the model is asked for output
    structured as ``SlackMessage``.
    """
    serialized_items = "\n".join([item.model_dump_json() for item in feed_items])
    prompt = [
        SystemMessage(SYSTEM_PROMPT),
        HumanMessage(serialized_items),
    ]
    structured_llm = LLM.with_structured_output(SlackMessage)

    log.debug("invoking model", extra={"model": MODEL})
    summary = structured_llm.invoke(prompt)
    log.debug("got response")

    return summary
|
||||
|
||||
|
||||
def send_to_slack(message: SlackMessage):
    """Post *message* to the Slack webhook, one Slack post per category.

    Each post consists of a bold category title line followed by one bullet
    per entry with a "Learn More" link. A post that Slack does not accept is
    logged as an error (previously the response was ignored, so failures were
    silent); the remaining categories are still attempted.
    """
    client = WebhookClient(SLACK_WEBHOOK_URL)

    for category in message.categories:
        lines = [f"{category.emoji} *{category.title}*"]
        lines.extend(
            f" • {entry.headline} - <{entry.link}|Learn More>"
            for entry in category.entries
        )

        response = client.send(text="\n".join(lines))
        if response.status_code != 200:
            log.error(
                "failed to post category to slack",
                extra={
                    "category": category.title,
                    "status_code": response.status_code,
                    "body": response.body,
                },
            )
|
||||
|
||||
|
||||
def main():
    """Fetch the last week of feed entries, summarize them, and post to Slack.

    Does nothing beyond logging when the feed has no entries from the last
    week, so the model is never asked to summarize an empty list.

    Raises:
        ValueError: if the Slack webhook parameter still holds its
            "not-set" seed value.
    """
    if SLACK_WEBHOOK_URL == "not-set":
        raise ValueError(
            f"No Slack Webhook URL is currently set in SSM at {PARAM_SLACK_WEBHOOK}"
        )

    one_week_ago = datetime.now() - timedelta(weeks=1)

    entries = get_feed_entries(FEED_URL, since=int(one_week_ago.timestamp()))
    if not entries:
        # Nothing new: skip the model call and avoid posting to Slack.
        log.info("no feed entries in the last week; nothing to post")
        return

    message = get_summary_message(entries)
    send_to_slack(message)
|
||||
|
||||
|
||||
@log.inject_lambda_context
def lambda_handler(event: dict, context: LambdaContext):
    """AWS Lambda entry point: run one summarization pass via ``main()``.

    Neither *event* nor *context* is read by this function body; the
    decorator receives them as part of the wrapped call.
    """
    main()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
29
variables.tf
Normal file
29
variables.tf
Normal file
@@ -0,0 +1,29 @@
|
||||
variable "environment" {
|
||||
type = string
|
||||
default = "prod"
|
||||
description = "Environment name to suffix to resources"
|
||||
}
|
||||
|
||||
variable "log_level" {
|
||||
type = string
|
||||
default = "INFO"
|
||||
description = "Lambda Log Level"
|
||||
}
|
||||
|
||||
variable "bedrock_model" {
|
||||
default = "eu.anthropic.claude-sonnet-4-20250514-v1:0"
|
||||
type = string
|
||||
description = "Which bedrock model ID to use"
|
||||
}
|
||||
|
||||
variable "schedule_expression" {
|
||||
type = string
|
||||
description = "AWS EventBridge Scheduler Expression for when to run"
|
||||
default = "cron(30 8 ? * FRI *)"
|
||||
}
|
||||
|
||||
variable "schedule_expression_timezone" {
|
||||
type = string
|
||||
description = "Timezone for the schedule expression"
|
||||
default = "UTC"
|
||||
}
|
||||
Reference in New Issue
Block a user