diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 61ab675..e818a7e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,4 @@ name: Run Pre-recorded Tests - on: pull_request: branches: @@ -7,7 +6,6 @@ on: push: branches: - master - jobs: run-tests: runs-on: ubuntu-latest @@ -16,41 +14,27 @@ jobs: python-version: ["3.10", "3.11"] steps: - name: Check out code - uses: actions/checkout@v2 - + uses: actions/checkout@v6 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install vcrpy pytest==7.4.2 requests pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet - + run: pip install -e ".[test]" - name: Run pre-recorded tests - run: | - make test - working-directory: . - + run: make test pylint-and-black: runs-on: ubuntu-latest steps: - name: Check out code - uses: actions/checkout@v2 - - - name: Set up Python 3.8 - uses: actions/setup-python@v4 + uses: actions/checkout@v6 + - name: Set up Python 3.11 + uses: actions/setup-python@v6 with: - python-version: 3.8 - - - name: Install dependencies for imports - run: | - pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet - - - name: Install pylint and black - run: | - pip install pylint black - + python-version: "3.11" + - name: Install dependencies + run: pip install -e ".[dev,test]" - name: Run pylint and black on ./documentcloud and ./tests run: | - pylint ./documentcloud ./tests; black --check ./documentcloud ./tests + pylint ./documentcloud ./tests + black --check ./documentcloud ./tests \ No newline at end of file diff --git a/.gitignore b/.gitignore index e2fec16..b52fe85 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ _build/ Pipfile Pipfile.lock .env +.DS_Store diff --git a/.isort.cfg b/.isort.cfg index 
888b42d..b5a3b30 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -3,7 +3,6 @@ line_length=88 force_to_top= skip= skip_glob= -not_skip=__init__.py known_future_library=__future__ known_first_party=documentcloud indent=' ' diff --git a/.pylintrc b/.pylintrc index d75dfc4..eb60c8c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,7 +3,4 @@ max-line-length=88 good-names=i,x1,x2,y1,y2,id [MESSAGES CONTROL] -disable=missing-docstring,too-many-ancestors,too-few-public-methods,no-else-return,no-member,attribute-defined-outside-init,similarities,import-outside-toplevel,cyclic-import,no-member,no-else-raise,too-many-instance-attributes,too-many-arguments,ungrouped-imports,useless-object-inheritance,no-else-continue - -[DESIGN] -max-positional-arguments=10 \ No newline at end of file +disable=missing-docstring,too-many-ancestors,too-few-public-methods,no-else-return,no-member,attribute-defined-outside-init,similarities,import-outside-toplevel,cyclic-import,no-member,no-else-raise,too-many-instance-attributes,too-many-arguments,too-many-positional-arguments,ungrouped-imports,useless-object-inheritance,no-else-continue diff --git a/Makefile b/Makefile index 966dc8a..b7a8ac1 100644 --- a/Makefile +++ b/Makefile @@ -41,10 +41,10 @@ coverage: check: pylint documentcloud black documentcloud - isort -rc documentcloud + isort documentcloud pylint tests black tests - isort -rc tests + isort tests # release a new version of the package to PyPI ship: diff --git a/docs/changelog.rst b/docs/changelog.rst index 1844ea8..39939f0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,22 @@ Changelog --------- +4.8.1 +~~~~~ +* Fixes coordinates for annotations. Credit: @synapticlee + +4.7.0 +~~~~~ +* Added burst-based sane rate limits to several endpoints. + +4.6.0 +~~~~~ +* Added load_run_data and store_run_data on the Add-On class to access AddOn run data. + +4.5.0 +~~~~~ +* Added OCR handling to upload and process methods. + 4.4.1 ~~~~~ * Fixes access to xlarge images. 
diff --git a/docs/conf.py b/docs/conf.py index 16adc2c..faf0f84 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,16 +48,16 @@ # General information about the project. project = "documentcloud" -copyright = "2025, MuckRock Foundation" +copyright = "2026, MuckRock Foundation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = "4.5" +version = "4.8" # The full version, including alpha/beta/rc tags. -release = "4.5.0" +release = "4.8.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/docs/documents.rst b/docs/documents.rst index 01721b2..f021359 100644 --- a/docs/documents.rst +++ b/docs/documents.rst @@ -89,6 +89,13 @@ DocumentClient ``original_extension`` to the extension of the file type, such as ``docx`` or ``jpg``. + .. note:: + If you are looking to upload large sets of documents, consider using + the `batch upload script `_, + which is optimized for bulk uploads with built-in retry and error handling. + We have battle-tested the script on our CIA CREST Database which included + almost a million documents. We have `video guides `_ + on how to get started with the script on macOS, Windows, and Linux. .. 
method:: upload_directory(path, handle_errors=False, extensions=".pdf" **kwargs) diff --git a/docs/gettingstarted.rst b/docs/gettingstarted.rst index 4137a4d..c8f8ec3 100644 --- a/docs/gettingstarted.rst +++ b/docs/gettingstarted.rst @@ -87,6 +87,16 @@ You can also provide URLs that link to PDFs, if that's the kind of thing you're >>> client.documents.upload("http://ord.legistar.com/Chicago/attachments/e3a0cbcb-044d-4ec3-9848-23c5692b1943.pdf") + + +Uploading large sets of documents +---------------------------------- + +If you are looking to upload large sets of documents using the API, we *strongly* recommend using our +`batch upload script `_, which is optimized for bulk uploads and handles retries, +rate limiting, and error logging automatically. We have battle-tested the script on our CIA CREST Database which included almost a million documents. We have `video guides `_ on how to get started with the script on macOS, Windows, and Linux. + + Uploading a document that is not a PDF ------------------------------------------------- diff --git a/documentcloud/addon.py b/documentcloud/addon.py index 8053661..4b0e14e 100644 --- a/documentcloud/addon.py +++ b/documentcloud/addon.py @@ -182,6 +182,26 @@ def upload_file(self, file): f"addon_runs/{self.id}/", json={"file_name": file_name} ) + def load_run_data(self): + "Load persistent data from this run" + if not self.id: + return {} + + response = self.client.get(f"addon_runs/{self.id}/") + response.raise_for_status() + return response.json().get("data", {}) + + def store_run_data(self, data): + "Store persistent data for this run" + if not self.id: + print("Run ID not set. 
Try again later or check if something went wrong.") + return None + + if not isinstance(data, dict): + raise TypeError("Invalid data") + + return self.client.patch(f"addon_runs/{self.id}/", json={"data": data}) + def load_event_data(self): """Load persistent data for this event""" if not self.event_id: diff --git a/documentcloud/annotations.py b/documentcloud/annotations.py index 01b1457..c02290d 100644 --- a/documentcloud/annotations.py +++ b/documentcloud/annotations.py @@ -79,7 +79,7 @@ def create( x2=None, y2=None, ): - coords = [x1, y2, x2, y2] + coords = [x1, y1, x2, y2] if not (all(c is None for c in coords) or all(c is not None for c in coords)): raise ValueError( - "x1, y2, x2, y2 must either all be None or all be not None" + "x1, y1, x2, y2 must either all be None or all be not None" diff --git a/documentcloud/client.py b/documentcloud/client.py index 1acee53..04ae838 100644 --- a/documentcloud/client.py +++ b/documentcloud/client.py @@ -1,12 +1,12 @@ -# Import SquareletClient from python-squarelet # Standard Library import logging +import time # Third Party +import token_bucket from squarelet import SquareletClient # Local -# Local Imports from .documents import DocumentClient from .organizations import OrganizationClient from .projects import ProjectClient @@ -14,6 +14,22 @@ logger = logging.getLogger("documentcloud") +# Per-endpoint rate limits applied on top of the global squarelet limit. 
+# Format: (method, url_pattern, rate_per_second, capacity) +# +# Endpoint Rate Burst Notes +# -------- ---- ----- ----- +# GET documents/search 15/min 50 +# POST documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min +# PUT documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min +# GET files/ 15/min 100 PDFs, full text, and other private assets +ENDPOINT_RATE_LIMITS = [ + ("GET", "documents/search", 15 / 60, 50), + ("POST", "documents/", 12 / 60, 100), + ("PUT", "documents/", 12 / 60, 100), + ("GET", "files/", 15 / 60, 100), +] + class DocumentCloud(SquareletClient): """ @@ -51,8 +67,34 @@ def __init__( else: logger.addHandler(logging.NullHandler()) + # Build per-endpoint token bucket rate limiters + storage = token_bucket.MemoryStorage() + self._endpoint_limiters = [ + ( + pattern_method, + pattern, + token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage), + f"{pattern_method}:{pattern}", + ) + for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS + ] + # Initialize the sub-clients using SquareletClient self.documents = DocumentClient(self) self.projects = ProjectClient(self) self.users = UserClient(self) self.organizations = OrganizationClient(self) + + def request(self, method, url, raise_error=True, **kwargs): + for pattern_method, pattern, limiter, bucket_key in self._endpoint_limiters: + if pattern_method.upper() == method.upper() and pattern in url: + if not limiter.consume(bucket_key): + logger.warning( + "Rate limit reached for %s %s, throttling...", + method.upper(), + pattern, + ) + while not limiter.consume(bucket_key): + time.sleep(0.1) + return super().request(method, url, raise_error=raise_error, **kwargs) + return super().request(method, url, raise_error=raise_error, **kwargs) diff --git a/documentcloud/documents.py b/documentcloud/documents.py index f968cac..0a7dd65 100644 --- a/documentcloud/documents.py +++ b/documentcloud/documents.py @@ -7,10 +7,13 @@ import logging import os import re +import time import 
warnings from functools import partial +from urllib.parse import urlparse # Third Party +import token_bucket from requests.exceptions import RequestException # Local @@ -23,15 +26,12 @@ from .toolbox import grouper, is_url, merge_dicts, requests_retry_session from .users import User -try: - from urllib.parse import urlparse -except ImportError: - from urlparse import urlparse - logger = logging.getLogger("documentcloud") IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"] +DEFAULT_USER_AGENT = "python-documentcloud" + class Document(BaseAPIObject): """A single DocumentCloud document""" @@ -168,12 +168,17 @@ def _get_url(self, url, fmt=None): if base_netloc == url_netloc: # if the url host is the same as the base api host, - # sent the request with the client in order to include + # send the request with the client in order to include # authentication credentials response = self._client.get(url, full_url=True) else: - response = requests_retry_session().get( - url, headers={"User-Agent": "python-documentcloud2"} + response = self._client.documents.asset_get( + url, + headers={ + "User-Agent": self._client.session.headers.get( + "User-Agent", DEFAULT_USER_AGENT + ) + }, ) if fmt == "text": return response.content.decode("utf8") @@ -250,6 +255,26 @@ class DocumentClient(BaseAPIClient): api_path = "documents" resource = Document + def __init__(self, client): + super().__init__(client) + # Rate limit for public document asset fetches (S3-hosted). + # Private document assets go through the API client and are limited there. + # Token bucket: burst of 100, sustained at 15/min (0.25/sec). 
+ storage = token_bucket.MemoryStorage() + self._asset_limiter = token_bucket.Limiter( + rate=15 / 60, + capacity=100, + storage=storage, + ) + self._asset_session = requests_retry_session() + + def asset_get(self, url, **kwargs): + if not self._asset_limiter.consume("asset"): + logger.warning("Rate limit reached for asset fetch, throttling...") + while not self._asset_limiter.consume("asset"): + time.sleep(0.1) + return self._asset_session.get(url, **kwargs) + def search(self, query, **params): """Return documents matching a search query""" diff --git a/documentcloud/exceptions.py b/documentcloud/exceptions.py index b26fe9a..ded0fca 100644 --- a/documentcloud/exceptions.py +++ b/documentcloud/exceptions.py @@ -2,11 +2,14 @@ Custom exceptions for python-documentcloud """ +# Third Party # pylint: disable=unused-import # Import exceptions from python-squarelet -from squarelet.exceptions import SquareletError as DocumentCloudError -from squarelet.exceptions import DuplicateObjectError -from squarelet.exceptions import CredentialsFailedError -from squarelet.exceptions import APIError -from squarelet.exceptions import DoesNotExistError -from squarelet.exceptions import MultipleObjectsReturnedError +from squarelet.exceptions import ( + APIError, + CredentialsFailedError, + DoesNotExistError, + DuplicateObjectError, + MultipleObjectsReturnedError, + SquareletError as DocumentCloudError, +) diff --git a/setup.py b/setup.py index 9c15513..e30b375 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="python-documentcloud", - version="4.5.0", + version="4.8.1", description="A simple Python wrapper for the DocumentCloud API", author="Mitchell Kotler", author_email="mitch@muckrock.com", @@ -27,6 +27,7 @@ "pyyaml", "fastjsonschema", "python-squarelet", + "token-bucket", ), extras_require={ "dev": [ @@ -40,6 +41,7 @@ "test": [ "pytest", "pytest-mock", + "pytest-xdist", "pytest-recording", "vcrpy", ], diff --git a/tests/README.md b/tests/README.md index 
b0ca885..f2da4c3 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,21 +1,21 @@ -This library interacts with the API for DocumentCloud. Running the test suite +This library interacts with the API for DocumentCloud. Running the test suite against the live server provides many challenges - it is slow, will not work without an internet connection, and could give false failures for intermitent -network failures. To resolve these issues we use VCR.py -(https://github.com/kevin1024/vcrpy) to record the HTTP requests. When new +network failures. To resolve these issues we use VCR.py +(https://github.com/kevin1024/vcrpy) to record the HTTP requests. When new HTTP requests are needed for the tests, they are recorded against a -localinstance of the server. Then the tests can be run against the +local instance of the server. Then the tests can be run against the pre-recorded responses, quickly and without interacting with any other software. To record the HTTP requests, you must have a local dev environment of Squarelet -and DocumentCloud running. You can find instructions for those here: +and DocumentCloud running. You can find instructions for those here: https://github.com/MuckRock/squarelet and -https://github.com/MuckRock/DocumentCloud. +https://github.com/MuckRock/DocumentCloud. You should create a test user locally, with the username `test-user` and password `test-password`. -There are some tests which require the access and refresh tokens to be expired. To accomodate this, those tests are expected to be run with the local Squarelet instance configured with very short lifetimes for those tokens. You should record the regular tests, change the settings, run the short tests, then change the settings back. The settings to change are located in `config/settings/base.py` in the Squarelet code base. Find the follow lines and uncomment the second two: +There are some tests which require the access and refresh tokens to be expired. 
To accommodate this, those tests are expected to be run with the local Squarelet instance configured with very short lifetimes for those tokens. You should record the regular tests, change the settings, run the short tests, then change the settings back. The settings to change are located in `config/settings/base.py` in the Squarelet code base. Find the following lines and uncomment the second two: ``` # These are used for testing token expiration @@ -23,7 +23,7 @@ There are some tests which require the access and refresh tokens to be expired. # "REFRESH_TOKEN_LIFETIME": timedelta(seconds=5), ``` -There is a Makefile included to help run the tests. The following commands are available: +There is a Makefile included to help run the tests. The following commands are available: `test-clean` - This will clean all of the pre-recorded requests for the non-short tests @@ -33,9 +33,9 @@ There is a Makefile included to help run the tests. The following commands are `test-create-short` - This will clean the short tests and then run all of them and record the HTTP requests. -`test` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, it will fail. +`test` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, it will fail. -`test-dev` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, record it. +`test-dev` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, record it. `tox` - run all tests under multiple Python versions using tox. @@ -45,38 +45,40 @@ There is a Makefile included to help run the tests. The following commands are `ship` - Release a new version of the library on PyPI. -A normal workflow would be to use `test-create` to create the intial saved requests, or if you want to re-record all of them for some reason. You would then change the short settings as described above, and run `test-create-short`. 
Running `test` should now pass while making no actual HTTP requests. If you add a new test with a new request, you can run `test-dev` to record just the new request while leaving the existing ones in place. The saved requests should be checked in to git. +A normal workflow would be to use `test-create` to create the initial saved requests, or if you want to re-record all of them for some reason. You would then change the short settings as described above, and run `test-create-short`. Running `test` should now pass while making no actual HTTP requests. If you add a new test with a new request, you can run `test-dev` to record just the new request while leaving the existing ones in place. The saved requests should be checked in to git. ## Troubleshooting -### Token Errors +### Token Errors + If you receive a lot of errors that are 405's with E requests.exceptions.HTTPError: 405 Client Error: Method Not Allowed for url: https://dev.squarelet.com/api/token/ -You need to change the BASE_URI and AUTH_URI to https. +You need to change the BASE_URI and AUTH_URI to https. ### SSL Errors -If you receive a bunch of 500/SSL errors when running the tests, it is likely that your local dev environment doesn't have access to the necessary certificates to authenticate with your local DocumentCloud environment. -You can copy the .PEM/.CRT file that is inside the docker container to your local environment and pass this file in so make the SSL errors go away. -To resolve this you will want to have your local DocumentCloud environment running, including the Django container. You can find the container ID of a running container by running -```docker ps``` +If you receive a bunch of 500/SSL errors when running the tests, it is likely that your local dev environment doesn't have access to the necessary certificates to authenticate with your local DocumentCloud environment. 
+You can copy the .PEM/.CRT file that is inside the docker container to your local environment and pass this file in to make the SSL errors go away. + +To resolve this you will want to have your local DocumentCloud environment running, including the Django container. You can find the container ID of a running container by running +`docker ps` Then, to retrieve the certificate run: -```docker cp container_id_here:/etc/ssl/certs/ca-certificates.crt ~``` -Substitute ~ with the location where you would like to copy the certificate file, as ~ is the home directory. +`docker cp container_id_here:/etc/ssl/certs/ca-certificates.crt ~` +Substitute ~ with the location where you would like to copy the certificate file, as ~ is the home directory. -You can then run the full test-suite by re-recording results and passing in the necessary certificate like so: -```REQUESTS_CA_BUNDLE=/path/to/ca-certificates.crt make test-create``` -/path/to should be replaced by the actual location. +You can then run the full test-suite by re-recording results and passing in the necessary certificate like so: +`REQUESTS_CA_BUNDLE=/path/to/ca-certificates.crt make test-create` +/path/to should be replaced by the actual location. ### Assertion Errors -If you get a failure for the contributor method, it is because you need to set a full name for the test user within Squarelet. +If you get a failure for the contributor method, it is because you need to set a full name for the test user within Squarelet. If you receive the following failure: -```assert len(list(all_documents)) > len(list(my_documents.results))``` -It is because you need to have another user created on your local dev environment on Squarelet, have them verified, and have them upload at least one document. -This tests asserts that the total sum of documents in your local dev environment is larger than those owned by you. This wouldn't hold true if your test user was the only user who has uploaded a document. 
+`assert len(list(all_documents)) > len(list(my_documents.results))` +It is because you need to have another user created on your local dev environment on Squarelet, have them verified, and have them upload at least one document. +This tests asserts that the total sum of documents in your local dev environment is larger than those owned by you. This wouldn't hold true if your test user was the only user who has uploaded a document. If you receive this similar assertion failure: -```assert len(all_projects.results) > len(my_projects.results)``` -You will need to have that other user create a project as well. This is to pass this assertion. +`assert len(all_projects.results) > len(my_projects.results)` +You will need to have that other user create a project as well. This is to pass this assertion. diff --git a/tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml b/tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml new file mode 100644 index 0000000..0dc49d0 --- /dev/null +++ b/tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml @@ -0,0 +1,209 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzU4NzMwNDI5LCJpYXQiOjE3NTg3MzAxMjksImp0aSI6IjEyMzE1OGFkOWExYjQ3YTE5ZDdhMjdlYjMwNTdmMzRkIiwidXNlcl9pZCI6ImY2MDI5ZThhLWQ4YTItNDE4OC04YTIxLWNkMDczYTAyZWU1YyIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjpbInNxdWFyZWxldCJdfQ.LduwXStVTHJGEoNw9eO-xQ0OVLF6b-kaeq-qXOCGJ06mc6JygXWOE7D9NnV7m-PzdtTKCGyQtOCe1RvU51ho5yFNJUwlLcYlmABbOQnvWOuuyqYvzoGXwULm99lV8KWW2Aetmldo_pUU3LY0VJS4Qe13ZaussP1Emsr68_uvy99FU8Xkm_jQqt4qoUDWuj-IeXkVj024Mv70VrCOBiTs_fmRHJixo12aNadUe9D0MhHoJxgBmRiRDqlq0Tu-jjP5I2_wY-c7iVSMUJTFInXHKRJadBaavvllrLZ4t47nc9G-ustuqnT_i8vrkN95LJmL87EK6NoVzS_O84c6DySZDA + Connection: + - keep-alive + User-Agent: + - 
python-requests/2.31.0 test-user + method: GET + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"detail":"Given token not valid for any token type","code":"token_not_valid","messages":[{"token_class":"AccessToken","token_type":"access","message":"Token + is invalid or expired"}]}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '183' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 403 + message: Forbidden +- request: + body: '{"refresh": "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc1ODgxNjUyOSwiaWF0IjoxNzU4NzMwMTI5LCJqdGkiOiIzZWI4MzExNmQzNTA0Njc0YTViZGFiOGEyYjFkNDA0ZiIsInVzZXJfaWQiOiJmNjAyOWU4YS1kOGEyLTQxODgtOGEyMS1jZDA3M2EwMmVlNWMiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6WyJzcXVhcmVsZXQiXX0.jl0ql4G-9ZFn0yOWSyzlXLjBXaQF9ZzWoHf7vxfrK9e4MnQ5jZyCLrFR7-nkjbKy9q9WAjlO3u3ZV3bzYW0xobiOuZcvTEiucy8qnzQlXLDLOjMy1JLnyh7VJI4Si40BSs5l-UfSvUv3854l6V_fxwcx0asLFVclT0PrDnAuNt50uxxgsSAwzrsquqPOASuG_6DHiD-DIE-MrWYiNc2Z5fy7eQFRt600oTOPRfLLmVixlqN33QfHO6GZQsM20vinJxyOXWvjtsGmcaJooxIkyU56HLObx6fxokzEGKzvHXLeF7zbrZuHaww8fPmFTtq-QjaY7Pt2vxmJnbBIIvRh-g"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '682' + Content-Type: + - application/json + User-Agent: + - python-requests/2.31.0 + method: POST + uri: https://dev.squarelet.com/api/refresh/ + response: + body: + string: '{"detail":"Token is invalid","code":"token_not_valid"}' + headers: + Allow: + - POST, OPTIONS + Connection: + - keep-alive + Content-Length: + - 
'54' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Server-Timing: + - TimerPanel_utime;dur=28.828999999973348;desc="User CPU time", TimerPanel_stime;dur=0.0;desc="System + CPU time", TimerPanel_total;dur=28.828999999973348;desc="Total CPU time", + TimerPanel_total_time;dur=39.24408298917115;desc="Elapsed time", SQLPanel_sql_time;dur=0.6609990086872131;desc="SQL + 3 queries", CachePanel_total_time;dur=0;desc="Cache 0 Calls" + Set-Cookie: + - op_browser_state=4b30254fc88d36c6b0a21c762b8607d5e34f30f158a79440cb58e2a0; + Path=/ + Vary: + - Accept, Origin, Cookie + WWW-Authenticate: + - Bearer realm="api" + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + djdt-store-id: + - 6cecf048e9574b87865d35848218f804 + status: + code: 401 + message: Unauthorized +- request: + body: '{"username": "test-user", "password": "test-password"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '54' + Content-Type: + - application/json + User-Agent: + - python-requests/2.31.0 + method: POST + uri: https://dev.squarelet.com/api/token/ + response: + body: + string: 
'{"refresh":"eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc3OTI4MTY3MSwiaWF0IjoxNzc5MTk1MjcxLCJqdGkiOiI2Zjc4M2JjNWI1MTc0ZWQ2OGQwODQ3ZGM4NGQ2ZjM0ZCIsInVzZXJfaWQiOiJlMmRiNzJlNi01OWQwLTQzZTQtYTVmMC01NTNiY2E2ODBlMjIiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6InNxdWFyZWxldCJ9.QlB40rVEix4qjfXJjHYw_NuSvM-1WS-kEY-yxTMl-CJYvTcBob_9L8O-wMQ0oegrC9HrE2W4kCOJcJrtTfMYiNspILWa4mB04VGagwQuuQRO_kvk8km3A6wa5-Vg-P6rGOTac6iBmM1SKCVI_FQ58o8ZDAs0fhcteA7RUDWOvP3ihV51qre6_z4WlxpwO-pKcv9_DZ6uldXt5UI-eEkXYQKb4_MxHZEu_MN6QMRzFGxWNJ8OfvUfjjeT2k6BrcB3cVi9m-E0sNBmHLFNkPgVGU4gqCCop2ifiRFU1ItLYYPwWRzCYoJHshudCwMgHVSfT15JwA-OhtYr_bBvSN-7YA","access":"eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA"}' + headers: + Allow: + - POST, OPTIONS + Connection: + - keep-alive + Content-Length: + - '1353' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Server-Timing: + - TimerPanel_utime;dur=284.20699999992394;desc="User CPU time", TimerPanel_stime;dur=412.77900000000045;desc="System + CPU time", TimerPanel_total;dur=696.9859999999244;desc="Total CPU time", TimerPanel_total_time;dur=237.52075000084005;desc="Elapsed + time", SQLPanel_sql_time;dur=14.25995901809074;desc="SQL 4 queries", 
CachePanel_total_time;dur=0;desc="Cache + 0 Calls" + Set-Cookie: + - op_browser_state=4b30254fc88d36c6b0a21c762b8607d5e34f30f158a79440cb58e2a0; + Path=/ + Vary: + - Accept, Origin, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + djdt-store-id: + - aab09d28b4fa48cabacca76d0eb9ff1e + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 test-user + method: GET + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"uuid":"27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe","addon":10,"event":null,"user":100003,"status":"queued","progress":0,"message":"","file_url":null,"file_expires_at":null,"dismissed":false,"rating":0,"comment":"","credits_spent":0,"created_at":"2026-05-19T12:41:37.038271Z","updated_at":"2026-05-19T12:41:37.039319Z","data":{}}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '324' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - 
nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml b/tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml new file mode 100644 index 0000000..5e98c5a --- /dev/null +++ b/tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml @@ -0,0 +1,98 @@ +interactions: +- request: + body: '{"data": {"foo": "bar", "n": 42}}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA + Connection: + - keep-alive + Content-Length: + - '33' + Content-Type: + - application/json + User-Agent: + - python-requests/2.31.0 test-user + method: PATCH + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"uuid":"27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe","addon":10,"event":null,"user":100003,"status":"queued","progress":0,"message":"","file_url":null,"file_expires_at":null,"dismissed":false,"rating":0,"comment":"","credits_spent":0,"created_at":"2026-05-19T12:41:37.038271Z","updated_at":"2026-05-19T12:54:31.765179Z","data":{"foo":"bar","n":42}}' 
+ headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '342' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 test-user + method: GET + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"uuid":"27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe","addon":10,"event":null,"user":100003,"status":"queued","progress":0,"message":"","file_url":null,"file_expires_at":null,"dismissed":false,"rating":0,"comment":"","credits_spent":0,"created_at":"2026-05-19T12:41:37.038271Z","updated_at":"2026-05-19T12:54:31.765179Z","data":{"n":42,"foo":"bar"}}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '342' + Content-Type: 
+      - application/json
+      Cross-Origin-Opener-Policy:
+      - same-origin
+      Date:
+      - Tue, 19 May 2026 12:54:31 GMT
+      Referrer-Policy:
+      - same-origin
+      Server:
+      - nginx/1.29.8
+      Vary:
+      - Accept, Origin, Accept-Language, Cookie
+      X-Content-Type-Options:
+      - nosniff
+      X-Frame-Options:
+      - DENY
+    status:
+      code: 200
+      message: OK
+version: 1
diff --git a/tests/conftest.py b/tests/conftest.py
index f8e81e5..06b4ef2 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,7 +1,5 @@
-# Future
-from __future__ import division, print_function, unicode_literals
-
 # Standard Library
+import os
 import time
 from uuid import uuid4
 
@@ -150,6 +148,20 @@ def project(client, document_factory):
     project.delete()
 
 
+DEFAULT_ADDON_RUN_ID = "27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe"
+
+
+@pytest.fixture(scope="session")
+def addon_run():
+    """Yield an AddOnRun UUID for VCR-based addon tests.
+
+    Defaults to the UUID baked into the recorded cassettes so replay works
+    out of the box. Override via DC_TEST_ADDON_RUN_ID to re-record against a
+    different run on the dev server.
+    """
+    yield os.environ.get("DC_TEST_ADDON_RUN_ID", DEFAULT_ADDON_RUN_ID)
+
+
 @pytest.fixture(scope="session")
 def project_factory(client):
     projects = []
diff --git a/tests/test_addon.py b/tests/test_addon.py
new file mode 100644
index 0000000..e50391e
--- /dev/null
+++ b/tests/test_addon.py
@@ -0,0 +1,141 @@
+# Standard Library
+from unittest.mock import MagicMock
+
+# Third Party
+import pytest
+
+# DocumentCloud
+from documentcloud.addon import AddOn
+
+# pylint: disable=redefined-outer-name
+
+
+@pytest.fixture
+def addon():
+    """An AddOn instance built without invoking argparse or constructing a real client.
+
+    Tests can override `.id`, `.event_id`, `.client`, etc. as needed.
+    """
+    instance = AddOn.__new__(AddOn)
+    instance.id = "run-123"
+    instance.addon_id = "addon-1"
+    instance.event_id = None
+    instance.documents = None
+    instance.query = None
+    instance.user_id = None
+    instance.org_id = None
+    instance.data = {}
+    instance.title = "Test AddOn"
+    instance.client = MagicMock()
+    return instance
+
+
+class TestLoadRunData:
+    def test_returns_data_when_run_id_set(self, addon):
+        addon.client.get.return_value.json.return_value = {"data": {"foo": "bar"}}
+
+        result = addon.load_run_data()
+
+        addon.client.get.assert_called_once_with("addon_runs/run-123/")
+        assert result == {"foo": "bar"}
+
+    def test_returns_empty_dict_when_no_run_id(self, addon):
+        addon.id = None
+
+        assert addon.load_run_data() == {}
+        addon.client.get.assert_not_called()
+
+    def test_returns_empty_dict_when_data_missing_from_response(self, addon):
+        addon.client.get.return_value.json.return_value = {}
+
+        assert addon.load_run_data() == {}
+
+
+class TestStoreRunData:
+    def test_patches_run_with_data(self, addon):
+        addon.store_run_data({"foo": "bar"})
+
+        addon.client.patch.assert_called_once_with(
+            "addon_runs/run-123/", json={"data": {"foo": "bar"}}
+        )
+
+    def test_no_op_when_no_run_id(self, addon, capsys):
+        addon.id = None
+
+        result = addon.store_run_data({"foo": "bar"})
+
+        assert result is None
+        addon.client.patch.assert_not_called()
+        assert "Run ID not set" in capsys.readouterr().out
+
+    def test_rejects_non_dict_data(self, addon):
+        with pytest.raises(TypeError):
+            addon.store_run_data("not a dict")
+
+        addon.client.patch.assert_not_called()
+
+
+class TestLoadEventData:
+    def test_returns_scratch_when_event_id_set(self, addon):
+        addon.event_id = "evt-9"
+        addon.client.get.return_value.json.return_value = {"scratch": {"x": 1}}
+
+        result = addon.load_event_data()
+
+        addon.client.get.assert_called_once_with("addon_events/evt-9/")
+        assert result == {"x": 1}
+
+    def test_returns_none_when_no_event_id(self, addon):
+        assert addon.load_event_data() is None
+        addon.client.get.assert_not_called()
+
+
+class TestStoreEventData:
+    def test_patches_event_with_scratch(self, addon):
+        addon.event_id = "evt-9"
+
+        addon.store_event_data({"x": 1})
+
+        addon.client.patch.assert_called_once_with(
+            "addon_events/evt-9/", json={"scratch": {"x": 1}}
+        )
+
+    def test_no_op_when_no_event_id(self, addon):
+        assert addon.store_event_data({"x": 1}) is None
+        addon.client.patch.assert_not_called()
+
+
+@pytest.fixture
+def real_addon(client, addon_run):
+    """An AddOn wired to the real `client` fixture and the `addon_run` UUID."""
+    instance = AddOn.__new__(AddOn)
+    instance.id = addon_run
+    instance.addon_id = None
+    instance.event_id = None
+    instance.documents = None
+    instance.query = None
+    instance.user_id = None
+    instance.org_id = None
+    instance.data = {}
+    instance.title = "Test AddOn"
+    instance.client = client
+    return instance
+
+
+class TestRunDataVCR:
+    """VCR-recorded round-trip tests against the dev DC.
+
+    Recording: set DC_TEST_ADDON_RUN_ID to an existing AddOnRun UUID on your
+    local dev DC, then run `make test-dev` (or `pytest --record-mode=new_episodes`).
+    """
+
+    def test_load_run_data_returns_dict(self, real_addon):
+        result = real_addon.load_run_data()
+        assert isinstance(result, dict)
+
+    def test_store_then_load_run_data_round_trip(self, real_addon):
+        payload = {"foo": "bar", "n": 42}
+        real_addon.store_run_data(payload)
+        loaded = real_addon.load_run_data()
+        assert loaded.get("foo") == "bar"
+        assert loaded.get("n") == 42
diff --git a/tests/test_client.py b/tests/test_client.py
index 7915f05..76e8bf3 100644
--- a/tests/test_client.py
+++ b/tests/test_client.py
@@ -9,6 +9,7 @@
 import ratelimit
 
 # DocumentCloud
+from documentcloud import DocumentCloud
 from documentcloud.constants import RATE_LIMIT
 from documentcloud.exceptions import APIError, CredentialsFailedError
 
@@ -111,3 +112,79 @@ def test_expired_refresh_token(short_client, record_mode):
     assert short_client.users.get("me")
     # check the refresh token was updated
     assert old_refresh_token != short_client.refresh_token
+
+
+def test_endpoint_rate_limit_burst_exhaustion():
+    """Token bucket should block after burst capacity is exhausted"""
+    client = DocumentCloud()
+    limiters = {(pm, p): (lim, bk) for pm, p, lim, bk in client._endpoint_limiters}
+    # Look up the search limiter by key (burst capacity=50) rather than by position
+    limiter, bucket_key = limiters[("GET", "documents/search")]
+    for _ in range(50):
+        limiter.consume(bucket_key)
+    assert not limiter.consume(bucket_key)
+
+
+def test_endpoint_rate_limit_method_specificity():
+    """GET files/ and POST documents/ should use different limiters"""
+    client = DocumentCloud()
+    limiters = {(pm, p): lim for pm, p, lim, _ in client._endpoint_limiters}
+    assert limiters[("GET", "files/")] is not limiters[("POST", "documents/")]
+
+
+def test_endpoint_rate_limit_pattern_ordering():
+    """documents/search should match before documents/"""
+    client = DocumentCloud()
+    url = "documents/search/"
+    matched = next(
+        p for pm, p, _, _ in client._endpoint_limiters if pm == "GET" and p in url
+    )
+    assert matched == "documents/search"
+
+
+def test_asset_rate_limit_burst_exhaustion():
+    """Asset token bucket should block after burst capacity is exhausted"""
+    client = DocumentCloud()
+    limiter = client.documents._asset_limiter
+    for _ in range(100):
+        limiter.consume("asset")
+    assert not limiter.consume("asset")
+
+
+def test_asset_rate_limit_refills():
+    """Asset token bucket should refill over time"""
+    client = DocumentCloud()
+    limiter = client.documents._asset_limiter
+    for _ in range(100):
+        limiter.consume("asset")
+    assert not limiter.consume("asset")
+    # NOTE(review): real 5s sleep; assumes >=1 token refills in that window - confirm
+    time.sleep(5)
+    assert limiter.consume("asset")
+
+
+def test_endpoint_rate_limit_buckets_are_independent():
+    """Exhausting one endpoint's bucket should not affect another"""
+    client = DocumentCloud()
+    limiters = {(pm, p): (lim, bk) for pm, p, lim, bk in client._endpoint_limiters}
+    search_limiter, search_key = limiters[("GET", "documents/search")]
+    files_limiter, files_key = limiters[("GET", "files/")]
+
+    # Exhaust search bucket
+    for _ in range(50):
+        search_limiter.consume(search_key)
+    assert not search_limiter.consume(search_key)
+
+    # Files bucket should still have tokens
+    assert files_limiter.consume(files_key)
+
+
+def test_endpoint_rate_limit_no_match_for_unrecognized_url():
+    """Unrecognized URLs should not match any endpoint limiter"""
+    client = DocumentCloud()
+    url = "users/me/"
+    matched = next(
+        (p for pm, p, _, _ in client._endpoint_limiters if p in url),
+        None,
+    )
+    assert matched is None