From 68fbc47f96b6900d4827ee7c556420274f40006a Mon Sep 17 00:00:00 2001 From: Alberto Villa Date: Sat, 9 Dec 2017 11:14:30 +0100 Subject: [PATCH] First commit --- README.md | 24 ++++ downloader.py | 126 ++++++++++++++++++++ instapaper.py | 316 ++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 466 insertions(+) create mode 100644 README.md create mode 100644 downloader.py create mode 100755 instapaper.py diff --git a/README.md b/README.md new file mode 100644 index 0000000..ea427d1 --- /dev/null +++ b/README.md @@ -0,0 +1,24 @@ +# Highlights Downloader + +A simple application that downloads all the highlights stored in your Instapaper folders as Markdown files. + +The downloader creates a tree of folders inside `highlights`. There will be one folder for each folder you have in Instapaper. The folder name will be the id of the folder in Instapaper. + +Inside every folder the downloader will create one Markdown file per bookmark that has highlights, including the title of the bookmark, its URL reference and all the highlights (one paragraph per highlight). + +## Requirements +Install the following packages: +- `pip install httplib2` +- `pip install oauth2` + +## Usage + +1. Get an OAuth KEY and SECRET from [Instapaper](https://www.instapaper.com/main/request_oauth_consumer_token) +2. Modify the file `Credentials.ini` with your KEY, SECRET, LOGIN and PASSWORD +3. Call the app `python downloader.py` + +You can then import these Markdown files into your favourite Notes application, e.g. [Bear](http://www.bear-writer.com/) + +## Credits + +Highlights Downloader makes use of a modified version of the [pyinstapaper library](https://github.com/mdorn/pyinstapaper), a Python wrapper for the Instapaper API; thanks [Matt Dorn](https://github.com/mdorn)! 
# ---- downloader.py (from patch: new file, mode 100644) ----
"""Download Instapaper highlights into per-folder Markdown files.

Running this script logs in to Instapaper with the credentials stored in
``Credentials.ini``, creates a ``highlights`` directory containing one
sub-directory per Instapaper folder (named after the folder id) and writes
one ``<bookmark_id>.md`` file for every bookmark that has highlights.
"""
import datetime
import io
import os

try:  # Python 2
    import ConfigParser
except ImportError:  # Python 3
    import configparser as ConfigParser

from instapaper import Instapaper, Folder

CREDENTIALS_FILE = 'Credentials.ini'
HIGHLIGHTS_DIR = 'highlights'

# Logged-in client used by process_folder(); assigned by main().
instapaper = None


def init():
    """Create an Instapaper client and log in with the stored credentials.

    Reads ``INSTAPAPER_KEY``/``INSTAPAPER_SECRET`` from the ``Instapaper``
    section and ``INSTAPAPER_LOGIN``/``INSTAPAPER_PASSWORD`` from the
    ``Login`` section of ``Credentials.ini``.

    :returns: the client object after ``login`` has been called on it
    :raises IOError: if ``Credentials.ini`` cannot be read
    """
    config = ConfigParser.RawConfigParser()
    # read() returns the list of files it managed to parse; an empty list
    # means the credentials file is missing or unreadable.
    if not config.read(CREDENTIALS_FILE):
        raise IOError('Could not read %s' % CREDENTIALS_FILE)

    key = config.get('Instapaper', 'INSTAPAPER_KEY')
    secret = config.get('Instapaper', 'INSTAPAPER_SECRET')
    login = config.get('Login', 'INSTAPAPER_LOGIN')
    password = config.get('Login', 'INSTAPAPER_PASSWORD')

    # The credentials are deliberately NOT printed: echoing the OAuth secret
    # and the account password would leak them into terminals and logs.

    # Create the client with the OAuth credentials, then log in
    client = Instapaper(key, secret)
    client.login(login, password)

    return client


def _enter_directory(path):
    """Create ``path`` if it does not exist yet and make it the cwd."""
    if not os.path.exists(path):
        os.makedirs(path)
    os.chdir(path)


def change_to_highlights_folder():
    """Change into the ``highlights`` directory, creating it if needed."""
    _enter_directory(HIGHLIGHTS_DIR)


def change_to_folder(folder_id):
    """Change into the directory named after ``folder_id``, creating it if needed.

    :param folder_id: Instapaper folder id; its ``str()`` is the directory name
    """
    _enter_directory(str(folder_id))


def get_list_of_existing_highlights():
    """Return the bookmark ids that already have a ``.md`` file in the cwd.

    Files whose stem is not an integer (for example a hand-written
    ``notes.md``) are ignored instead of aborting the run.

    :returns: list of bookmark ids as ints
    """
    existing = []
    for name in os.listdir('.'):
        if not name.endswith('.md'):
            continue
        try:
            existing.append(int(os.path.splitext(name)[0]))
        except ValueError:
            # Not one of our <bookmark_id>.md files
            continue
    return existing


def process_folder(folder):
    """Download the highlights of every bookmark in one Instapaper folder.

    Changes the working directory into the folder's directory and leaves it
    there; the caller is responsible for changing back.

    :param folder: object exposing ``folder_id`` and ``title``
    """
    # Show id and title of the folder
    print('%s %s' % (folder.folder_id, folder.title))

    change_to_folder(folder.folder_id)

    request_args = {'folder': folder.folder_id, 'limit': 500}
    existing = get_list_of_existing_highlights()
    if existing:
        # Send the ids as a comma-separated string; handing a Python list to
        # the client would URL-encode its repr ("[1, 2]") instead.
        request_args['have'] = ','.join(
            str(bookmark_id) for bookmark_id in existing)

    bookmarks = instapaper.get_bookmarks(**request_args)

    process_bookmarks(bookmarks)


def process_bookmarks(bookmarks):
    """Process every bookmark in ``bookmarks``, printing how many there are."""
    print('Number of bookmarks: ' + str(len(bookmarks)))

    for bookmark in bookmarks:
        process_bookmark(bookmark)


def _render_markdown(bookmark, highlights):
    """Build the Markdown document for one bookmark as a unicode string.

    Layout: a ``# title`` heading, a ``[Reference](url)`` link, then every
    highlight text followed by a blank line.
    """
    parts = [
        u'# ', bookmark.title or u'', u'\n',
        u'[Reference](', bookmark.url or u'', u')\n\n',
    ]
    for highlight in highlights:
        parts.append(highlight.text or u'')
        parts.append(u'\n\n')
    return u''.join(parts)


def process_bookmark(bookmark):
    """Write ``<bookmark_id>.md`` for ``bookmark`` if it has highlights.

    Nothing is done when the file already exists or when the bookmark has no
    highlights.

    :param bookmark: object exposing ``bookmark_id``, ``title``, ``url`` and
        ``get_highlights()``
    """
    filename = str(bookmark.bookmark_id) + '.md'

    # Check the file first: fetching highlights costs one (rate-limited)
    # API request per bookmark, which is wasted when the file is already here.
    if os.path.exists(filename):
        return

    highlights = bookmark.get_highlights()
    if not highlights:
        return

    # Show that we have found a new bookmark with highlights
    print('New highlight file: ' + filename)

    # Render before opening so a failure cannot leave an empty file behind
    # (an empty file would make later runs skip this bookmark).
    content = _render_markdown(bookmark, highlights)
    with io.open(filename, 'w', encoding='utf-8') as new_file:
        new_file.write(content)


def main():
    """Log in and download the highlights of every Instapaper folder."""
    global instapaper

    # Init Instapaper
    instapaper = init()

    # Change to highlights folder
    change_to_highlights_folder()

    # Process each folder, returning to the highlights folder after each one
    for folder in instapaper.get_folders():
        process_folder(folder)
        os.chdir('..')


if __name__ == '__main__':
    main()

# ---- instapaper.py (from patch: new file, mode 100755) ----
# -*- coding: utf-8 -*-
'''Small client for the Instapaper API built on top of ``oauth2``.

Provides the ``Instapaper`` client plus ``Bookmark``, ``Folder`` and
``Highlight`` wrappers around the items the API returns.
'''

from datetime import datetime

import json
import logging
import time

try:  # Python 2
    import urlparse
    from urllib import urlencode
except ImportError:  # Python 3
    from urllib import parse as urlparse
    from urllib.parse import urlencode

import oauth2 as oauth

BASE_URL = 'https://www.instapaper.com'
API_VERSION = '1'
ACCESS_TOKEN = 'oauth/access_token'
LOGIN_URL = 'https://www.instapaper.com/user/login'
REQUEST_DELAY_SECS = 0.5

log = logging.getLogger(__name__)


class InstapaperError(Exception):
    '''Raised when the API reports an error or a login attempt fails.'''


def _join_ids(ids):
    '''Return ``ids`` in the comma-separated form sent as the ``have`` param.

    Lists, tuples and sets are joined with commas; any other value (for
    example an already formatted string) is returned unchanged. Empty or
    ``None`` input yields an empty string.
    '''
    if not ids:
        return ''
    if isinstance(ids, (list, tuple, set, frozenset)):
        return ','.join(str(entry) for entry in ids)
    return ids


def _build_objects(items, wanted_type, factory):
    '''Turn the API items of type ``wanted_type`` into objects.

    :param items: iterable of dicts as decoded from an API response
    :param str wanted_type: value of the ``type`` key to keep
    :param factory: callable receiving one item dict and returning an object
    :returns: list of objects built by ``factory``; other item types are
        skipped
    :raises InstapaperError: if an item of type ``error`` is encountered
    '''
    built = []
    for item in items:
        item_type = item.get('type')
        if item_type == 'error':
            raise InstapaperError(item.get('message'))
        if item_type == wanted_type:
            built.append(factory(item))
    return built


class Instapaper(object):
    '''Instapaper client class.

    :param oauth_key str: Instapaper OAuth consumer key
    :param oauth_secret str: Instapaper OAuth consumer secret
    '''

    def __init__(self, oauth_key, oauth_secret):
        self.consumer = oauth.Consumer(oauth_key, oauth_secret)
        self.oauth_client = oauth.Client(self.consumer)
        self.token = None

    def login(self, username, password):
        '''Authenticate using XAuth variant of OAuth.

        On success ``self.token`` is set and ``self.oauth_client`` is
        replaced by a client that signs requests with that token.

        :param str username: Username or email address for the relevant account
        :param str password: Password for the account
        :raises InstapaperError: if the response carries no OAuth token
        '''
        response = self.request(
            ACCESS_TOKEN,
            {
                'x_auth_mode': 'client_auth',
                'x_auth_username': username,
                'x_auth_password': password
            },
            returns_json=False
        )
        token = dict(urlparse.parse_qsl(response['data']))
        if 'oauth_token' not in token or 'oauth_token_secret' not in token:
            # Surface the server's reply instead of an opaque KeyError
            raise InstapaperError('Login failed: %s' % response['data'])
        self.token = oauth.Token(
            token['oauth_token'], token['oauth_token_secret'])
        self.oauth_client = oauth.Client(self.consumer, self.token)

    def request(self, path, params=None, returns_json=True,
                method='POST', api_version=API_VERSION):
        '''Process a request using the OAuth client's request method.

        Sleeps ``REQUEST_DELAY_SECS`` before every request.

        :param str path: Path fragment to the API endpoint, e.g. "resource/ID"
        :param dict params: Parameters to pass to request
        :param bool returns_json: Try to decode the body as JSON; when that
            fails (or when False) the raw body is returned instead
        :param str method: Optional HTTP method, normally POST for Instapaper
        :param str api_version: Optional alternative API version
        :returns: dict with the response headers under ``response`` and the
            decoded (or raw) body under ``data``
        :rtype: dict
        :raises InstapaperError: if the body is a one-item list whose item has
            type ``error``
        '''
        time.sleep(REQUEST_DELAY_SECS)
        full_path = '/'.join([BASE_URL, 'api/%s' % api_version, path])
        log.debug('URL: %s', full_path)
        request_kwargs = {'method': method}
        if params:
            request_kwargs['body'] = urlencode(params)
        response, content = self.oauth_client.request(
            full_path, **request_kwargs)
        # On Python 3 the body arrives as bytes; decode it so the URL and
        # JSON parsing below work on text. On Python 2 ``bytes is str`` and
        # this branch never runs.
        if isinstance(content, bytes) and not isinstance(content, str):
            content = content.decode('utf-8')
        log.debug('CONTENT: %s ...', content[:50])

        data = content
        if returns_json:
            try:
                data = json.loads(content)
            except ValueError:
                # Instapaper API can be unpredictable/inconsistent, e.g.
                # bookmarks/get_text doesn't return JSON
                data = content
            else:
                # The API always returns a list, even when only one item
                # (such as a single error) is expected.
                if (isinstance(data, list) and len(data) == 1
                        and isinstance(data[0], dict)
                        and data[0].get('type') == 'error'):
                    raise InstapaperError('Instapaper error %s: %s' % (
                        data[0]['error_code'], data[0]['message']))
        return {
            'response': response,
            'data': data
        }

    def get_bookmarks(self, folder='unread', limit=10, have=None):
        '''Return list of user's bookmarks.

        :param str folder: Optional. Possible values are unread (default),
            starred, archive, or a folder_id value.
        :param int limit: Optional. A number between 1 and 500, default 10.
        :param have: Optional. Bookmark ids the caller already has, either as
            a list/tuple/set of ids or as a ready-made comma-separated
            string. Omitted from the request when empty.
        :returns: List of user's bookmarks
        :rtype: list
        :raises InstapaperError: if the API returns an error item
        '''
        params = {'folder_id': folder, 'limit': limit}
        have = _join_ids(have)
        if have:
            params['have'] = have
        response = self.request('bookmarks/list', params)
        return _build_objects(
            response['data'], 'bookmark',
            lambda item: Bookmark(self, **item))

    def get_folders(self):
        '''Return list of user's folders.

        :rtype: list
        :raises InstapaperError: if the API returns an error item
        '''
        response = self.request('folders/list')
        return _build_objects(
            response['data'], 'folder',
            lambda item: Folder(self, **item))


class InstapaperObject(object):

    '''Base class for Instapaper objects like Bookmark.

    Subclasses define ``RESOURCE``, ``RESOURCE_ID_ATTRIBUTE``, ``ATTRIBUTES``
    and ``SIMPLE_ACTIONS`` and may define ``TIMESTAMP_ATTRS``. Every name in
    ``SIMPLE_ACTIONS`` becomes a callable attribute on the instance (e.g.
    ``bookmark.star()``) that issues the matching API request.

    :param client: object whose ``request(path, params)`` method performs the
        API call (the ``Instapaper`` client)
    :param dict data: key/value pairs of object attributes, e.g. title, etc.
    '''

    def __init__(self, client, **data):
        self.client = client
        timestamp_attrs = getattr(self, 'TIMESTAMP_ATTRS', ())
        for attrib in self.ATTRIBUTES:
            val = data.get(attrib)
            # Absent timestamps stay None; objects created locally for
            # add() have none yet.
            if attrib in timestamp_attrs and val is not None:
                try:
                    val = datetime.fromtimestamp(int(val))
                except (TypeError, ValueError):
                    log.warning(
                        'Could not cast %s for %s as datetime',
                        val, attrib
                    )
            setattr(self, attrib, val)
        self.object_id = getattr(self, self.RESOURCE_ID_ATTRIBUTE)
        for action in self.SIMPLE_ACTIONS:
            # Bind the current action as a default argument so each callable
            # keeps its own action name (closures bind late).
            setattr(
                self, action,
                lambda action=action: self._simple_action(action))

    def add(self):
        '''Save an object to Instapaper after instantiating it.

        Only attributes with a truthy value are submitted.

        Example::

            folder = Folder(instapaper, title='stuff')
            result = folder.add()

        :returns: Response from the API
        :rtype: dict
        '''
        # TODO validation per object type
        submit_attribs = {}
        for attrib in self.ATTRIBUTES:
            val = getattr(self, attrib, None)
            if val:
                submit_attribs[attrib] = val
        path = '/'.join([self.RESOURCE, 'add'])
        return self.client.request(path, submit_attribs)

    def _simple_action(self, action=None):
        '''Issue a request for an API method whose only param is the obj ID.

        :param str action: The name of the action for the resource
        :returns: Response from the API
        :rtype: dict
        :raises Exception: if no action name is given
        '''
        if not action:
            raise Exception('No simple action defined')
        path = "/".join([self.RESOURCE, action])
        return self.client.request(
            path, {self.RESOURCE_ID_ATTRIBUTE: self.object_id}
        )


class Bookmark(InstapaperObject):

    '''Object representing an Instapaper bookmark/article.'''

    RESOURCE = 'bookmarks'
    RESOURCE_ID_ATTRIBUTE = 'bookmark_id'
    # TODO: identify which fields to convert from timestamp to Python datetime
    ATTRIBUTES = [
        'bookmark_id',
        'title',
        'description',
        'hash',
        'url',
        'progress_timestamp',
        'time',
        'progress',
        'starred',
        'type',
        'private_source'
    ]
    TIMESTAMP_ATTRS = [
        'progress_timestamp',
        'time'
    ]
    SIMPLE_ACTIONS = [
        'delete',
        'star',
        'archive',
        'unarchive',
        'get_text'
    ]

    def __str__(self):
        title = self.title
        if not isinstance(title, str):
            # Python 2 unicode title: encode so __str__ returns a byte string
            title = title.encode('utf-8')
        return 'Bookmark %s: %s' % (self.object_id, title)

    def get_highlights(self):
        '''Get highlights for Bookmark instance.

        :return: list of ``Highlight`` objects
        :rtype: list
        :raises InstapaperError: if the API returns an error item
        '''
        # NOTE: all Instapaper API methods use POST except this one!
        path = '/'.join([self.RESOURCE, str(self.object_id), 'highlights'])
        response = self.client.request(path, method='GET', api_version='1.1')
        # Hand the highlights the API client (not this Bookmark) so their
        # own actions, e.g. highlight.delete(), can issue requests.
        return _build_objects(
            response['data'], 'highlight',
            lambda item: Highlight(self.client, **item))


class Folder(InstapaperObject):

    '''Object representing an Instapaper folder.'''

    RESOURCE = 'folders'
    RESOURCE_ID_ATTRIBUTE = 'folder_id'
    ATTRIBUTES = [
        'folder_id',
        'title',
        'display_title',
        'sync_to_mobile',
        'position',
        'type',
        'slug',
    ]
    SIMPLE_ACTIONS = [
        'delete',
    ]

    def __str__(self):
        return 'Folder %s: %s' % (self.object_id, self.title)

    def set_order(self, folder_ids):
        '''Order the user's folders (not implemented yet).

        :param list folder_ids: List of folder IDs in the desired order.
        :returns: List Folder objects in the new order.
        :rtype: list
        :raises NotImplementedError: always
        '''
        # TODO
        raise NotImplementedError


class Highlight(InstapaperObject):

    '''Object representing an Instapaper highlight.'''

    RESOURCE = 'highlights'
    RESOURCE_ID_ATTRIBUTE = 'highlight_id'

    ATTRIBUTES = [
        'highlight_id',
        'text',
        'note',
        'time',
        'position',
        'bookmark_id',
        'type',
        'slug',
    ]
    TIMESTAMP_ATTRS = [
        'time',
    ]
    SIMPLE_ACTIONS = [
        'delete',
    ]

    def __str__(self):
        return 'Highlight %s for Article %s' % (
            self.object_id, self.bookmark_id)

    def create(self):
        '''Create the highlight on Instapaper (not implemented yet).

        :raises NotImplementedError: always
        '''
        # TODO
        raise NotImplementedError