Files
feedsummarizer/src/summarizer.py
Harry Reeder 19872d51cb Initial Commit
2025-06-20 13:53:46 +01:00

150 lines
4.2 KiB
Python

import os
from datetime import datetime, timedelta
from html.parser import HTMLParser
from io import StringIO
import requests
from aws_lambda_powertools import Logger
from aws_lambda_powertools.utilities import parameters
from aws_lambda_powertools.utilities.typing import LambdaContext
from langchain_core.messages import SystemMessage, HumanMessage
from langchain_aws import ChatBedrockConverse
from rss_parser import RSSParser
from slack_sdk.webhook import WebhookClient
from models import FeedItem, SlackMessage
# Configurable
MODEL = os.environ["BEDROCK_MODEL"]
MAXLEN_DESCRIPTION = int(os.environ.get("MAXLEN_DESCRIPTION", "2500"))
PARAM_SLACK_WEBHOOK = os.environ["PARAM_SLACK_WEBHOOK"]
USER_AGENT = os.environ.get("USER_AGENT", "feedsummarizer 0.1 +harry@reeder.dev")
# TODO: Make FEED_URL and SYSTEM_PROMPT configurable
FEED_URL = "https://aws.amazon.com/about-aws/whats-new/recent/feed/"
SYSTEM_PROMPT = """Using the following entries from the AWS "What's New" feed, create a once-per-week message to post to slack to keep people up to date with what they should know.
Ensure that anything which is in preview is clearly marked as such.
Categories should have relevant emoji, and individual entries should have a "Learn More" link to the article's page.
"""
LLM = ChatBedrockConverse(
model=MODEL,
temperature=0.5,
max_tokens=None,
)
SLACK_WEBHOOK_URL = parameters.get_parameter(PARAM_SLACK_WEBHOOK, decrypt=True)
log = Logger()
# From: https://stackoverflow.com/a/925630
class MLStripper(HTMLParser):
def __init__(self):
super().__init__()
self.reset()
self.strict = False
self.convert_charrefs = True
self.text = StringIO()
def handle_data(self, d):
self.text.write(d)
def get_data(self):
return self.text.getvalue()
def strip_tags(html: str) -> str:
s = MLStripper()
s.feed(html)
return s.get_data()
def get_feed_entries(feed_url: str, since: int) -> list[FeedItem]:
response = requests.get(feed_url, headers={"User-Agent": USER_AGENT})
response_text = response.text
log.debug(
"got feed",
extra={
"feed_url": feed_url,
"response": {
"status_code": response.status_code,
"headers": dict(response.headers),
},
},
)
feed = RSSParser.parse(response_text)
items = [
FeedItem(
title=it.title.content,
description=strip_tags(it.description.content)[:MAXLEN_DESCRIPTION],
pub_date=it.pub_date.content,
link=it.links[0].content,
)
for it in feed.channel.items
]
items = [it for it in items if it.timestamp > since]
oldest = min(items, key=lambda it: it.timestamp)
log.debug(
"filtered items",
extra={
"item_count": len(items),
"maxlen_description": MAXLEN_DESCRIPTION,
"oldest": oldest.pub_date,
},
)
return items
def get_summary_message(feed_items: list[FeedItem]) -> SlackMessage:
messages = [
SystemMessage(SYSTEM_PROMPT),
HumanMessage("\n".join([it.model_dump_json() for it in feed_items])),
]
model = LLM.with_structured_output(SlackMessage)
log.debug("invoking model", extra={"model": MODEL})
response = model.invoke(messages)
log.debug("got response")
return response
def send_to_slack(message: SlackMessage):
client = WebhookClient(SLACK_WEBHOOK_URL)
for category in message.categories:
slack_message = f"{category.emoji} *{category.title}*"
for entry in category.entries:
slack_message += f"\n{entry.headline} - <{entry.link}|Learn More>"
client.send(text=slack_message)
def main():
if SLACK_WEBHOOK_URL == "not-set":
raise ValueError(
f"No Slack Webhook URL is currently set in SSM at {os.environ['PARAM_SLACK_WEBHOOK']}"
)
now = datetime.now()
last_week = now - timedelta(weeks=1)
entries = get_feed_entries(FEED_URL, since=int(last_week.timestamp()))
message = get_summary_message(entries)
send_to_slack(message)
@log.inject_lambda_context
def lambda_handler(event: dict, context: LambdaContext):
main()
if __name__ == "__main__":
main()