From cabf2d9f502c765b6e2ef11da121c24b75536af5 Mon Sep 17 00:00:00 2001 From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com> Date: Fri, 24 Apr 2026 19:12:16 -0500 Subject: [PATCH 01/13] Point to batch upload script in docs --- docs/documents.rst | 7 +++++++ docs/gettingstarted.rst | 10 ++++++++++ 2 files changed, 17 insertions(+) diff --git a/docs/documents.rst b/docs/documents.rst index 01721b2..f021359 100644 --- a/docs/documents.rst +++ b/docs/documents.rst @@ -89,6 +89,13 @@ DocumentClient ``original_extension`` to the extension of the file type, such as ``docx`` or ``jpg``. + .. note:: + If you are looking to upload large sets of documents, consider using + the `batch upload script `_, + which is optimized for bulk uploads with built-in retry and error handling. + We have battle-tested the script on our CIA CREST Database which included + almost a million documents. We have `video guides `_ + on how to get started with the script on macOS, Windows, and Linux. .. method:: upload_directory(path, handle_errors=False, extensions=".pdf" **kwargs) diff --git a/docs/gettingstarted.rst b/docs/gettingstarted.rst index 4137a4d..c8f8ec3 100644 --- a/docs/gettingstarted.rst +++ b/docs/gettingstarted.rst @@ -87,6 +87,16 @@ You can also provide URLs that link to PDFs, if that's the kind of thing you're >>> client.documents.upload("http://ord.legistar.com/Chicago/attachments/e3a0cbcb-044d-4ec3-9848-23c5692b1943.pdf") + + +Uploading large sets of documents +---------------------------------- + +If you are looking to upload large sets of documents using the API, we *strongly* recommend using our +`batch upload script `_, which is optimized for bulk uploads and handles retries, +rate limiting, and error logging automatically. We have battle-tested the script on our CIA Crest Database which included almost a million documents. We have `video guides `_ on how to get started with the script on macOS, Windows and Linux. + + Uploading a document that is not a PDF ------------------------------------------------- From 86cd8b3be4019b81db0843765adbacb63001001b Mon Sep 17 00:00:00 2001 From: Chris Amico Date: Mon, 18 May 2026 16:07:23 -0400 Subject: [PATCH 02/13] Add methods to get and set run data --- .gitignore | 1 + documentcloud/addon.py | 20 ++++++++ tests/README.md | 58 ++++++++++++----------- tests/test_addon.py | 103 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 154 insertions(+), 28 deletions(-) create mode 100644 tests/test_addon.py diff --git a/.gitignore b/.gitignore index e2fec16..b52fe85 100644 --- a/.gitignore +++ b/.gitignore @@ -17,3 +17,4 @@ _build/ Pipfile Pipfile.lock .env +.DS_Store diff --git a/documentcloud/addon.py b/documentcloud/addon.py index 8053661..77dc79f 100644 --- a/documentcloud/addon.py +++ b/documentcloud/addon.py @@ -182,6 +182,26 @@ def upload_file(self, file): f"addon_runs/{self.id}/", json={"file_name": file_name} ) + def load_run_data(self): + "Load persistent data from this run" + if not self.id: + return {} + + response = self.client.get(f"addon_runs/{self.id}/") + response.raise_for_status() + return response.json().get("data", {}) + + def store_run_data(self, data): + "Store persistent data for this run" + if not self.id: + print("Run ID not set. Try again later or check if something went wrong.") + return + + if not isinstance(data, dict): + raise TypeError("Invalid data") + + return self.client.patch(f"addon_runs/{self.id}/", json=data) + def load_event_data(self): """Load persistent data for this event""" if not self.event_id: diff --git a/tests/README.md b/tests/README.md index b0ca885..f2da4c3 100644 --- a/tests/README.md +++ b/tests/README.md @@ -1,21 +1,21 @@ -This library interacts with the API for DocumentCloud. Running the test suite +This library interacts with the API for DocumentCloud. Running the test suite against the live server provides many challenges - it is slow, will not work without an internet connection, and could give false failures for intermitent -network failures. To resolve these issues we use VCR.py -(https://github.com/kevin1024/vcrpy) to record the HTTP requests. When new +network failures. To resolve these issues we use VCR.py +(https://github.com/kevin1024/vcrpy) to record the HTTP requests. When new HTTP requests are needed for the tests, they are recorded against a -localinstance of the server. Then the tests can be run against the +localinstance of the server. Then the tests can be run against the pre-recorded responses, quickly and without interacting with any other software. To record the HTTP requests, you must have a local dev environment of Squarelet -and DocumentCloud running. You can find instructions for those here: +and DocumentCloud running. You can find instructions for those here: https://github.com/MuckRock/squarelet and -https://github.com/MuckRock/DocumentCloud. +https://github.com/MuckRock/DocumentCloud. You should create a test user locally, with the username `test-user` and password `test-password`. -There are some tests which require the access and refresh tokens to be expired. To accomodate this, those tests are expected to be run with the local Squarelet instance configured with very short lifetimes for those tokens. You should record the regular tests, change the settings, run the short tests, then change the settings back. The settings to change are located in `config/settings/base.py` in the Squarelet code base. Find the follow lines and uncomment the second two: +There are some tests which require the access and refresh tokens to be expired. To accomodate this, those tests are expected to be run with the local Squarelet instance configured with very short lifetimes for those tokens. You should record the regular tests, change the settings, run the short tests, then change the settings back. The settings to change are located in `config/settings/base.py` in the Squarelet code base. Find the follow lines and uncomment the second two: ``` # These are used for testing token expiration @@ -23,7 +23,7 @@ There are some tests which require the access and refresh tokens to be expired. # "REFRESH_TOKEN_LIFETIME": timedelta(seconds=5), ``` -There is a Makefile included to help run the tests. The following commands are available: +There is a Makefile included to help run the tests. The following commands are available: `test-clean` - This will clean all of the pre-recorded requests for the non-short tests @@ -33,9 +33,9 @@ There is a Makefile included to help run the tests. The following commands are `test-create-short` - This will clean the short tests and then run all of them and record the HTTP requests. -`test` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, it will fail. +`test` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, it will fail. -`test-dev` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, record it. +`test-dev` - run all tests using the pre-recorded HTTP requests. If an HTTP request is missing, record it. `tox` - run all tests under multiple Python versions using tox. @@ -45,38 +45,40 @@ There is a Makefile included to help run the tests. The following commands are `ship` - Release a new version of the library on PyPI. -A normal workflow would be to use `test-create` to create the intial saved requests, or if you want to re-record all of them for some reason. You would then change the short settings as described above, and run `test-create-short`. Running `test` should now pass while making no actual HTTP requests. If you add a new test with a new request, you can run `test-dev` to record just the new request while leaving the existing ones in place. The saved requests should be checked in to git. +A normal workflow would be to use `test-create` to create the intial saved requests, or if you want to re-record all of them for some reason. You would then change the short settings as described above, and run `test-create-short`. Running `test` should now pass while making no actual HTTP requests. If you add a new test with a new request, you can run `test-dev` to record just the new request while leaving the existing ones in place. The saved requests should be checked in to git. ## Troubleshooting -### Token Errors +### Token Errors + If you receive a lot of errors that are 405's with E requests.exceptions.HTTPError: 405 Client Error: Method Not Allowed for url: https://dev.squarelet.com/api/token/ -You need to change the BASE_URI and AUTH_URI to https. +You need to change the BASE_URI and AUTH_URI to https. ### SSL Errors -If you receive a bunch of 500/SSL errors when running the tests, it is likely that your local dev environment doesn't have access to the necessary certificates to authenticate with your local DocumentCloud environment. -You can copy the .PEM/.CRT file that is inside the docker container to your local environment and pass this file in so make the SSL errors go away. -To resolve this you will want to have your local DocumentCloud environment running, including the Django container. You can find the container ID of a running container by running -```docker ps``` +If you receive a bunch of 500/SSL errors when running the tests, it is likely that your local dev environment doesn't have access to the necessary certificates to authenticate with your local DocumentCloud environment. +You can copy the .PEM/.CRT file that is inside the docker container to your local environment and pass this file in so make the SSL errors go away. + +To resolve this you will want to have your local DocumentCloud environment running, including the Django container. You can find the container ID of a running container by running +`docker ps` Then, to retrieve the certificate run: -```docker cp container_id_here:/etc/ssl/certs/ca-certificates.crt ~``` -Substitute ~ with the location where you would like to copy the certificate file, as ~ is the home directory. +`docker cp container_id_here:/etc/ssl/certs/ca-certificates.crt ~` +Substitute ~ with the location where you would like to copy the certificate file, as ~ is the home directory. -You can then run the full test-suite by re-recording results and passing in the necessary certificate like so: -```REQUESTS_CA_BUNDLE=/path/to/ca-certificates.crt make test-create``` -/path/to should be replaced by the actual location. +You can then run the full test-suite by re-recording results and passing in the necessary certificate like so: +`REQUESTS_CA_BUNDLE=/path/to/ca-certificates.crt make test-create` +/path/to should be replaced by the actual location. ### Assertion Errors -If you get a failure for the contributor method, it is because you need to set a full name for the test user within Squarelet. +If you get a failure for the contributor method, it is because you need to set a full name for the test user within Squarelet. If you receive the following failure: -```assert len(list(all_documents)) > len(list(my_documents.results))``` -It is because you need to have another user created on your local dev environment on Squarelet, have them verified, and have them upload at least one document. -This tests asserts that the total sum of documents in your local dev environment is larger than those owned by you. This wouldn't hold true if your test user was the only user who has uploaded a document. +`assert len(list(all_documents)) > len(list(my_documents.results))` +It is because you need to have another user created on your local dev environment on Squarelet, have them verified, and have them upload at least one document. +This tests asserts that the total sum of documents in your local dev environment is larger than those owned by you. This wouldn't hold true if your test user was the only user who has uploaded a document. If you receive this similar assertion failure: -```assert len(all_projects.results) > len(my_projects.results)``` -You will need to have that other user create a project as well. This is to pass this assertion. +`assert len(all_projects.results) > len(my_projects.results)` +You will need to have that other user create a project as well. This is to pass this assertion. diff --git a/tests/test_addon.py b/tests/test_addon.py new file mode 100644 index 0000000..2e8da9c --- /dev/null +++ b/tests/test_addon.py @@ -0,0 +1,103 @@ +# Standard Library +from unittest.mock import MagicMock + +# Third Party +import pytest + +# DocumentCloud +from documentcloud.addon import AddOn + + +@pytest.fixture +def addon(): + """An AddOn instance built without invoking argparse or constructing a real client. + + Tests can override `.id`, `.event_id`, `.client`, etc. as needed. + """ + instance = AddOn.__new__(AddOn) + instance.id = "run-123" + instance.addon_id = "addon-1" + instance.event_id = None + instance.documents = None + instance.query = None + instance.user_id = None + instance.org_id = None + instance.data = {} + instance.title = "Test AddOn" + instance.client = MagicMock() + return instance + + +class TestLoadRunData: + def test_returns_data_when_run_id_set(self, addon): + addon.client.get.return_value.json.return_value = {"data": {"foo": "bar"}} + + result = addon.load_run_data() + + addon.client.get.assert_called_once_with("addon_runs/run-123/") + assert result == {"foo": "bar"} + + def test_returns_empty_dict_when_no_run_id(self, addon): + addon.id = None + + assert addon.load_run_data() == {} + addon.client.get.assert_not_called() + + def test_returns_empty_dict_when_data_missing_from_response(self, addon): + addon.client.get.return_value.json.return_value = {} + + assert addon.load_run_data() == {} + + +class TestStoreRunData: + def test_patches_run_with_data(self, addon): + addon.store_run_data({"foo": "bar"}) + + addon.client.patch.assert_called_once_with( + "addon_runs/run-123/", json={"foo": "bar"} + ) + + def test_no_op_when_no_run_id(self, addon, capsys): + addon.id = None + + result = addon.store_run_data({"foo": "bar"}) + + assert result is None + addon.client.patch.assert_not_called() + assert "Run ID not set" in capsys.readouterr().out + + def test_rejects_non_dict_data(self, addon): + with pytest.raises(TypeError): + addon.store_run_data("not a dict") + + addon.client.patch.assert_not_called() + + +class TestLoadEventData: + def test_returns_scratch_when_event_id_set(self, addon): + addon.event_id = "evt-9" + addon.client.get.return_value.json.return_value = {"scratch": {"x": 1}} + + result = addon.load_event_data() + + addon.client.get.assert_called_once_with("addon_events/evt-9/") + assert result == {"x": 1} + + def test_returns_none_when_no_event_id(self, addon): + assert addon.load_event_data() is None + addon.client.get.assert_not_called() + + +class TestStoreEventData: + def test_patches_event_with_scratch(self, addon): + addon.event_id = "evt-9" + + addon.store_event_data({"x": 1}) + + addon.client.patch.assert_called_once_with( + "addon_events/evt-9/", json={"scratch": {"x": 1}} + ) + + def test_no_op_when_no_event_id(self, addon): + assert addon.store_event_data({"x": 1}) is None + addon.client.patch.assert_not_called() From 941e8dcdec5d649762063bf69e6288f5aed11ca5 Mon Sep 17 00:00:00 2001 From: Chris Amico Date: Mon, 18 May 2026 16:16:32 -0400 Subject: [PATCH 03/13] Fix linting --- .isort.cfg | 1 - .pylintrc | 3 --- Makefile | 4 ++-- documentcloud/addon.py | 2 +- documentcloud/documents.py | 2 ++ documentcloud/exceptions.py | 15 +++++++++------ tests/test_addon.py | 2 ++ 7 files changed, 16 insertions(+), 13 deletions(-) diff --git a/.isort.cfg b/.isort.cfg index 888b42d..b5a3b30 100644 --- a/.isort.cfg +++ b/.isort.cfg @@ -3,7 +3,6 @@ line_length=88 force_to_top= skip= skip_glob= -not_skip=__init__.py known_future_library=__future__ known_first_party=documentcloud indent=' ' diff --git a/.pylintrc b/.pylintrc index d75dfc4..3276277 100644 --- a/.pylintrc +++ b/.pylintrc @@ -4,6 +4,3 @@ good-names=i,x1,x2,y1,y2,id [MESSAGES CONTROL] disable=missing-docstring,too-many-ancestors,too-few-public-methods,no-else-return,no-member,attribute-defined-outside-init,similarities,import-outside-toplevel,cyclic-import,no-member,no-else-raise,too-many-instance-attributes,too-many-arguments,ungrouped-imports,useless-object-inheritance,no-else-continue - -[DESIGN] -max-positional-arguments=10 \ No newline at end of file diff --git a/Makefile b/Makefile index 966dc8a..b7a8ac1 100644 --- a/Makefile +++ b/Makefile @@ -41,10 +41,10 @@ coverage: check: pylint documentcloud black documentcloud - isort -rc documentcloud + isort documentcloud pylint tests black tests - isort -rc tests + isort tests # release a new version of the package to PyPI ship: diff --git a/documentcloud/addon.py b/documentcloud/addon.py index 77dc79f..03ea3a7 100644 --- a/documentcloud/addon.py +++ b/documentcloud/addon.py @@ -195,7 +195,7 @@ def store_run_data(self, data): "Store persistent data for this run" if not self.id: print("Run ID not set. Try again later or check if something went wrong.") - return + return None if not isinstance(data, dict): raise TypeError("Invalid data") diff --git a/documentcloud/documents.py b/documentcloud/documents.py index f968cac..dd78fbd 100644 --- a/documentcloud/documents.py +++ b/documentcloud/documents.py @@ -24,8 +24,10 @@ from .users import User try: + # Standard Library from urllib.parse import urlparse except ImportError: + # Third Party from urlparse import urlparse logger = logging.getLogger("documentcloud") diff --git a/documentcloud/exceptions.py b/documentcloud/exceptions.py index b26fe9a..ded0fca 100644 --- a/documentcloud/exceptions.py +++ b/documentcloud/exceptions.py @@ -2,11 +2,14 @@ Custom exceptions for python-documentcloud """ +# Third Party # pylint: disable=unused-import # Import exceptions from python-squarelet -from squarelet.exceptions import SquareletError as DocumentCloudError -from squarelet.exceptions import DuplicateObjectError -from squarelet.exceptions import CredentialsFailedError -from squarelet.exceptions import APIError -from squarelet.exceptions import DoesNotExistError -from squarelet.exceptions import MultipleObjectsReturnedError +from squarelet.exceptions import ( + APIError, + CredentialsFailedError, + DoesNotExistError, + DuplicateObjectError, + MultipleObjectsReturnedError, + SquareletError as DocumentCloudError, +) diff --git a/tests/test_addon.py b/tests/test_addon.py index 2e8da9c..818da25 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -7,6 +7,8 @@ # DocumentCloud from documentcloud.addon import AddOn +# pylint: disable=redefined-outer-name + @pytest.fixture def addon(): From 1bbe4440366b76148ebbf546436a4b2dc6d2745b Mon Sep 17 00:00:00 2001 From: Chris Amico Date: Tue, 19 May 2026 09:00:20 -0400 Subject: [PATCH 04/13] Add cassettes for addon runs --- documentcloud/addon.py | 2 +- ...taVCR.test_load_run_data_returns_dict.yaml | 209 ++++++++++++++++++ ...t_store_then_load_run_data_round_trip.yaml | 98 ++++++++ tests/conftest.py | 20 +- tests/test_addon.py | 38 +++- 5 files changed, 361 insertions(+), 6 deletions(-) create mode 100644 tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml create mode 100644 tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml diff --git a/documentcloud/addon.py b/documentcloud/addon.py index 03ea3a7..4b0e14e 100644 --- a/documentcloud/addon.py +++ b/documentcloud/addon.py @@ -200,7 +200,7 @@ def store_run_data(self, data): if not isinstance(data, dict): raise TypeError("Invalid data") - return self.client.patch(f"addon_runs/{self.id}/", json=data) + return self.client.patch(f"addon_runs/{self.id}/", json={"data": data}) def load_event_data(self): """Load persistent data for this event""" diff --git a/tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml b/tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml new file mode 100644 index 0000000..0dc49d0 --- /dev/null +++ b/tests/cassettes/test_addon/TestRunDataVCR.test_load_run_data_returns_dict.yaml @@ -0,0 +1,209 @@ +interactions: +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzU4NzMwNDI5LCJpYXQiOjE3NTg3MzAxMjksImp0aSI6IjEyMzE1OGFkOWExYjQ3YTE5ZDdhMjdlYjMwNTdmMzRkIiwidXNlcl9pZCI6ImY2MDI5ZThhLWQ4YTItNDE4OC04YTIxLWNkMDczYTAyZWU1YyIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjpbInNxdWFyZWxldCJdfQ.LduwXStVTHJGEoNw9eO-xQ0OVLF6b-kaeq-qXOCGJ06mc6JygXWOE7D9NnV7m-PzdtTKCGyQtOCe1RvU51ho5yFNJUwlLcYlmABbOQnvWOuuyqYvzoGXwULm99lV8KWW2Aetmldo_pUU3LY0VJS4Qe13ZaussP1Emsr68_uvy99FU8Xkm_jQqt4qoUDWuj-IeXkVj024Mv70VrCOBiTs_fmRHJixo12aNadUe9D0MhHoJxgBmRiRDqlq0Tu-jjP5I2_wY-c7iVSMUJTFInXHKRJadBaavvllrLZ4t47nc9G-ustuqnT_i8vrkN95LJmL87EK6NoVzS_O84c6DySZDA + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 test-user + method: GET + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"detail":"Given token not valid for any token type","code":"token_not_valid","messages":[{"token_class":"AccessToken","token_type":"access","message":"Token + is invalid or expired"}]}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '183' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 403 + message: Forbidden +- request: + body: '{"refresh": "eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc1ODgxNjUyOSwiaWF0IjoxNzU4NzMwMTI5LCJqdGkiOiIzZWI4MzExNmQzNTA0Njc0YTViZGFiOGEyYjFkNDA0ZiIsInVzZXJfaWQiOiJmNjAyOWU4YS1kOGEyLTQxODgtOGEyMS1jZDA3M2EwMmVlNWMiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6WyJzcXVhcmVsZXQiXX0.jl0ql4G-9ZFn0yOWSyzlXLjBXaQF9ZzWoHf7vxfrK9e4MnQ5jZyCLrFR7-nkjbKy9q9WAjlO3u3ZV3bzYW0xobiOuZcvTEiucy8qnzQlXLDLOjMy1JLnyh7VJI4Si40BSs5l-UfSvUv3854l6V_fxwcx0asLFVclT0PrDnAuNt50uxxgsSAwzrsquqPOASuG_6DHiD-DIE-MrWYiNc2Z5fy7eQFRt600oTOPRfLLmVixlqN33QfHO6GZQsM20vinJxyOXWvjtsGmcaJooxIkyU56HLObx6fxokzEGKzvHXLeF7zbrZuHaww8fPmFTtq-QjaY7Pt2vxmJnbBIIvRh-g"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '682' + Content-Type: + - application/json + User-Agent: + - python-requests/2.31.0 + method: POST + uri: https://dev.squarelet.com/api/refresh/ + response: + body: + string: '{"detail":"Token is invalid","code":"token_not_valid"}' + headers: + Allow: + - POST, OPTIONS + Connection: + - keep-alive + Content-Length: + - '54' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Server-Timing: + - TimerPanel_utime;dur=28.828999999973348;desc="User CPU time", TimerPanel_stime;dur=0.0;desc="System + CPU time", TimerPanel_total;dur=28.828999999973348;desc="Total CPU time", + TimerPanel_total_time;dur=39.24408298917115;desc="Elapsed time", SQLPanel_sql_time;dur=0.6609990086872131;desc="SQL + 3 queries", CachePanel_total_time;dur=0;desc="Cache 0 Calls" + Set-Cookie: + - op_browser_state=4b30254fc88d36c6b0a21c762b8607d5e34f30f158a79440cb58e2a0; + Path=/ + Vary: + - Accept, Origin, Cookie + WWW-Authenticate: + - Bearer realm="api" + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + djdt-store-id: + - 6cecf048e9574b87865d35848218f804 + status: + code: 401 + message: Unauthorized +- request: + body: '{"username": "test-user", "password": "test-password"}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Connection: + - keep-alive + Content-Length: + - '54' + Content-Type: + - application/json + User-Agent: + - python-requests/2.31.0 + method: POST + uri: https://dev.squarelet.com/api/token/ + response: + body: + string: '{"refresh":"eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoicmVmcmVzaCIsImV4cCI6MTc3OTI4MTY3MSwiaWF0IjoxNzc5MTk1MjcxLCJqdGkiOiI2Zjc4M2JjNWI1MTc0ZWQ2OGQwODQ3ZGM4NGQ2ZjM0ZCIsInVzZXJfaWQiOiJlMmRiNzJlNi01OWQwLTQzZTQtYTVmMC01NTNiY2E2ODBlMjIiLCJhdWQiOlsic3F1YXJlbGV0IiwibXVja3JvY2siLCJkb2N1bWVudGNsb3VkIl0sImlzcyI6InNxdWFyZWxldCJ9.QlB40rVEix4qjfXJjHYw_NuSvM-1WS-kEY-yxTMl-CJYvTcBob_9L8O-wMQ0oegrC9HrE2W4kCOJcJrtTfMYiNspILWa4mB04VGagwQuuQRO_kvk8km3A6wa5-Vg-P6rGOTac6iBmM1SKCVI_FQ58o8ZDAs0fhcteA7RUDWOvP3ihV51qre6_z4WlxpwO-pKcv9_DZ6uldXt5UI-eEkXYQKb4_MxHZEu_MN6QMRzFGxWNJ8OfvUfjjeT2k6BrcB3cVi9m-E0sNBmHLFNkPgVGU4gqCCop2ifiRFU1ItLYYPwWRzCYoJHshudCwMgHVSfT15JwA-OhtYr_bBvSN-7YA","access":"eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA"}' + headers: + Allow: + - POST, OPTIONS + Connection: + - keep-alive + Content-Length: + - '1353' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Server-Timing: + - TimerPanel_utime;dur=284.20699999992394;desc="User CPU time", TimerPanel_stime;dur=412.77900000000045;desc="System + CPU time", TimerPanel_total;dur=696.9859999999244;desc="Total CPU time", TimerPanel_total_time;dur=237.52075000084005;desc="Elapsed + time", SQLPanel_sql_time;dur=14.25995901809074;desc="SQL 4 queries", CachePanel_total_time;dur=0;desc="Cache + 0 Calls" + Set-Cookie: + - op_browser_state=4b30254fc88d36c6b0a21c762b8607d5e34f30f158a79440cb58e2a0; + Path=/ + Vary: + - Accept, Origin, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + djdt-store-id: + - aab09d28b4fa48cabacca76d0eb9ff1e + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 test-user + method: GET + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"uuid":"27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe","addon":10,"event":null,"user":100003,"status":"queued","progress":0,"message":"","file_url":null,"file_expires_at":null,"dismissed":false,"rating":0,"comment":"","credits_spent":0,"created_at":"2026-05-19T12:41:37.038271Z","updated_at":"2026-05-19T12:41:37.039319Z","data":{}}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '324' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +version: 1 diff --git a/tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml b/tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml new file mode 100644 index 0000000..5e98c5a --- /dev/null +++ b/tests/cassettes/test_addon/TestRunDataVCR.test_store_then_load_run_data_round_trip.yaml @@ -0,0 +1,98 @@ +interactions: +- request: + body: '{"data": {"foo": "bar", "n": 42}}' + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA + Connection: + - keep-alive + Content-Length: + - '33' + Content-Type: + - application/json + User-Agent: + - python-requests/2.31.0 test-user + method: PATCH + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"uuid":"27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe","addon":10,"event":null,"user":100003,"status":"queued","progress":0,"message":"","file_url":null,"file_expires_at":null,"dismissed":false,"rating":0,"comment":"","credits_spent":0,"created_at":"2026-05-19T12:41:37.038271Z","updated_at":"2026-05-19T12:54:31.765179Z","data":{"foo":"bar","n":42}}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '342' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +- request: + body: null + headers: + Accept: + - '*/*' + Accept-Encoding: + - gzip, deflate + Authorization: + - Bearer eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJ0b2tlbl90eXBlIjoiYWNjZXNzIiwiZXhwIjoxNzc5MTk1NTcxLCJpYXQiOjE3NzkxOTUyNzEsImp0aSI6IjQzYzM5MjBhN2JhZTQ3YWNhMGZhZmFkNDAwM2RhMzFhIiwidXNlcl9pZCI6ImUyZGI3MmU2LTU5ZDAtNDNlNC1hNWYwLTU1M2JjYTY4MGUyMiIsImF1ZCI6WyJzcXVhcmVsZXQiLCJtdWNrcm9jayIsImRvY3VtZW50Y2xvdWQiXSwiaXNzIjoic3F1YXJlbGV0In0.LWZGVfF5v7LgFFHvmo5q6i0hHMdytd5EhwvgIxCDj8N6UPKB-74JIUfIJhqJnAOiTGWlP3g-jUI3a8QLjIyRQjlVW03xwCeeNy0NWVBShdbdj8-s8X2JO1Xs8vMZ4YcyQhYUMK6Ow9ARzsAE7GDLQCh0zRROgbYuWv_ZJcEmN3RUGiJVpkLCiEu7vvfXUdb3Tn-1hnRW1mWxuU200u6hi-Of3hALQILuE-VB8bCaHABujpsy8sjZERukCIeNME52ZanvQxYeWb9DVgurehknPft_w5VXdYlO4webYJrgn0enbVm2Chw14iKjCP-6Xq6wivRz5SiwOsnfQGcU3bDoLA + Connection: + - keep-alive + User-Agent: + - python-requests/2.31.0 test-user + method: GET + uri: https://api.dev.documentcloud.org/api/addon_runs/27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe/ + response: + body: + string: '{"uuid":"27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe","addon":10,"event":null,"user":100003,"status":"queued","progress":0,"message":"","file_url":null,"file_expires_at":null,"dismissed":false,"rating":0,"comment":"","credits_spent":0,"created_at":"2026-05-19T12:41:37.038271Z","updated_at":"2026-05-19T12:54:31.765179Z","data":{"n":42,"foo":"bar"}}' + headers: + Allow: + - GET, PUT, PATCH, DELETE, HEAD, OPTIONS + Connection: + - keep-alive + Content-Language: + - en + Content-Length: + - '342' + Content-Type: + - application/json + Cross-Origin-Opener-Policy: + - same-origin + Date: + - Tue, 19 May 2026 12:54:31 GMT + Referrer-Policy: + - same-origin + Server: + - nginx/1.29.8 + Vary: + - Accept, Origin, Accept-Language, Cookie + X-Content-Type-Options: + - nosniff + X-Frame-Options: + - DENY + status: + code: 200 + message: OK +version: 1 diff --git a/tests/conftest.py b/tests/conftest.py index f8e81e5..c7ece84 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,7 +1,5 @@ -# Future -from __future__ import division, print_function, unicode_literals - # Standard Library +import os import time from uuid import uuid4 @@ -19,7 +17,7 @@ USERNAME = "test-user" PASSWORD = "test-password" TIMEOUT = 2.0 -DEFAULT_DOCUMENT_URI = "https://assets.documentcloud.org/documents/20071460/test.pdf" +DEFAULT_DOCUMENT_URI = "https://s3.documentcloud.org/documents/20071460/test.pdf" # pylint: disable=redefined-outer-name @@ -150,6 +148,20 @@ def project(client, document_factory): project.delete() +DEFAULT_ADDON_RUN_ID = "27d5bff2-2ff7-4b2e-bf5b-20ee9f02a1fe" + + +@pytest.fixture(scope="session") +def addon_run(): + """Yield an AddOnRun UUID for VCR-based addon tests. + + Defaults to the UUID baked into the recorded cassettes so replay works + out of the box. Override via DC_TEST_ADDON_RUN_ID to re-record against a + different run on the dev server. + """ + yield os.environ.get("DC_TEST_ADDON_RUN_ID", DEFAULT_ADDON_RUN_ID) + + @pytest.fixture(scope="session") def project_factory(client): projects = [] diff --git a/tests/test_addon.py b/tests/test_addon.py index 818da25..e50391e 100644 --- a/tests/test_addon.py +++ b/tests/test_addon.py @@ -56,7 +56,7 @@ def test_patches_run_with_data(self, addon): addon.store_run_data({"foo": "bar"}) addon.client.patch.assert_called_once_with( - "addon_runs/run-123/", json={"foo": "bar"} + "addon_runs/run-123/", json={"data": {"foo": "bar"}} ) def test_no_op_when_no_run_id(self, addon, capsys): @@ -103,3 +103,39 @@ def test_patches_event_with_scratch(self, addon): def test_no_op_when_no_event_id(self, addon): assert addon.store_event_data({"x": 1}) is None addon.client.patch.assert_not_called() + + +@pytest.fixture +def real_addon(client, addon_run): + """An AddOn wired to the real `client` fixture and a freshly created run.""" + instance = AddOn.__new__(AddOn) + instance.id = addon_run + instance.addon_id = None + instance.event_id = None + instance.documents = None + instance.query = None + instance.user_id = None + instance.org_id = None + instance.data = {} + instance.title = "Test AddOn" + instance.client = client + return instance + + +class TestRunDataVCR: + """VCR-recorded round-trip tests against the dev DC. + + Recording: set DC_TEST_ADDON_RUN_ID to an existing AddOnRun UUID on your + local dev DC, then run `make test-dev` (or `pytest --record-mode=new_episodes`). + """ + + def test_load_run_data_returns_dict(self, real_addon): + result = real_addon.load_run_data() + assert isinstance(result, dict) + + def test_store_then_load_run_data_round_trip(self, real_addon): + payload = {"foo": "bar", "n": 42} + real_addon.store_run_data(payload) + loaded = real_addon.load_run_data() + assert loaded.get("foo") == "bar" + assert loaded.get("n") == 42 From d476a85855e8e6204d4ce9e6215a1831028c597a Mon Sep 17 00:00:00 2001 From: Chris Amico Date: Tue, 19 May 2026 09:09:17 -0400 Subject: [PATCH 05/13] Fix tests and upgrade workflow deps --- .github/workflows/main.yml | 24 ++++++++++++------------ tests/conftest.py | 2 +- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 61ab675..8452afb 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -16,17 +16,17 @@ jobs: python-version: ["3.10", "3.11"] steps: - name: Check out code - uses: actions/checkout@v2 - + uses: actions/checkout@v6 + - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v4 + uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - + - name: Install dependencies run: | pip install vcrpy pytest==7.4.2 requests pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet - + - name: Run pre-recorded tests run: | make test @@ -36,17 +36,17 @@ jobs: runs-on: ubuntu-latest steps: - name: Check out code - uses: actions/checkout@v2 + uses: actions/checkout@v6 - - name: Set up Python 3.8 - uses: actions/setup-python@v4 + - name: Set up Python 3.11 + uses: actions/setup-python@v6 with: - python-version: 3.8 - + python-version: "3.11" + - name: Install dependencies for imports run: | - pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet - + pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet + - name: Install pylint and black run: | pip install pylint black diff --git a/tests/conftest.py b/tests/conftest.py index c7ece84..06b4ef2 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -17,7 +17,7 @@ USERNAME = "test-user" PASSWORD = "test-password" TIMEOUT = 2.0 -DEFAULT_DOCUMENT_URI = "https://s3.documentcloud.org/documents/20071460/test.pdf" +DEFAULT_DOCUMENT_URI = "https://assets.documentcloud.org/documents/20071460/test.pdf" # pylint: disable=redefined-outer-name From ace99ae4717b863e95244cf7fe8bcc7919cd62a8 Mon Sep 17 00:00:00 2001 From: Chris Amico Date: Tue, 19 May 2026 09:17:25 -0400 Subject: [PATCH 06/13] lint --- .pylintrc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.pylintrc b/.pylintrc index 3276277..eb60c8c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,4 +3,4 @@ max-line-length=88 good-names=i,x1,x2,y1,y2,id [MESSAGES CONTROL] -disable=missing-docstring,too-many-ancestors,too-few-public-methods,no-else-return,no-member,attribute-defined-outside-init,similarities,import-outside-toplevel,cyclic-import,no-member,no-else-raise,too-many-instance-attributes,too-many-arguments,ungrouped-imports,useless-object-inheritance,no-else-continue +disable=missing-docstring,too-many-ancestors,too-few-public-methods,no-else-return,no-member,attribute-defined-outside-init,similarities,import-outside-toplevel,cyclic-import,no-member,no-else-raise,too-many-instance-attributes,too-many-arguments,too-many-positional-arguments,ungrouped-imports,useless-object-inheritance,no-else-continue From 9d90ba065ca951b1f06898ff63d68748a65c045f Mon Sep 17 00:00:00 2001 From: Chris Amico Date: Tue, 19 May 2026 09:20:36 -0400 Subject: [PATCH 07/13] py3k --- documentcloud/documents.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/documentcloud/documents.py b/documentcloud/documents.py index dd78fbd..d126f9d 100644 --- a/documentcloud/documents.py +++ b/documentcloud/documents.py @@ -9,6 +9,7 @@ import re import warnings from functools import partial +from urllib.parse import urlparse # Third Party from requests.exceptions import RequestException @@ -23,13 +24,6 @@ from .toolbox import grouper, is_url, merge_dicts, requests_retry_session from .users import User -try: - # Standard Library - from urllib.parse import urlparse -except ImportError: - # Third Party - from urlparse import urlparse - logger = logging.getLogger("documentcloud") IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"] From eb4416070038084944a31009d873661f584f27cf Mon Sep 17 00:00:00 2001 From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com> Date: Tue, 19 May 2026 09:57:00 -0500 Subject: [PATCH 08/13] Bump version to 4.6.0 --- docs/changelog.rst | 8 ++++++++ docs/conf.py | 6 +++--- setup.py | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 1844ea8..8cea322 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,14 @@ Changelog --------- +4.6.0 +~~~~~ +* Added load_run_data and store_run_data on the Add-On class to access AddOn run data. + +4.5.0 +~~~~~ +* Added OCR handling to upload and process methods. + 4.4.1 ~~~~~ * Fixes access to xlarge images. diff --git a/docs/conf.py b/docs/conf.py index 16adc2c..7c76388 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -48,16 +48,16 @@ # General information about the project. project = "documentcloud" -copyright = "2025, MuckRock Foundation" +copyright = "2026, MuckRock Foundation" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # # The short X.Y version. -version = "4.5" +version = "4.6" # The full version, including alpha/beta/rc tags. -release = "4.5.0" +release = "4.6.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 9c15513..a76b8ed 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="python-documentcloud", - version="4.5.0", + version="4.6.0", description="A simple Python wrapper for the DocumentCloud API", author="Mitchell Kotler", author_email="mitch@muckrock.com", From 0c18b54867a9dbc8fef4ec714368a6f86f30a724 Mon Sep 17 00:00:00 2001 From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com> Date: Fri, 12 Jun 2026 12:44:07 -0500 Subject: [PATCH 09/13] Add sane rate limits,tests for rate limits and fix workflow --- .github/workflows/main.yml | 28 +++----------- documentcloud/client.py | 46 ++++++++++++++++++++++- documentcloud/documents.py | 35 ++++++++++++++++-- setup.py | 2 + tests/test_client.py | 75 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 159 insertions(+), 27 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 8452afb..e818a7e 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -1,5 +1,4 @@ name: Run Pre-recorded Tests - on: pull_request: branches: @@ -7,7 +6,6 @@ on: push: branches: - master - jobs: run-tests: runs-on: ubuntu-latest @@ -17,40 +15,26 @@ jobs: steps: - name: Check out code uses: actions/checkout@v6 - - name: Set up Python ${{ matrix.python-version }} uses: actions/setup-python@v6 with: python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - pip install vcrpy pytest==7.4.2 requests pytest-mock python-documentcloud pytest-xdist pytest-recording python-squarelet - + run: pip install -e ".[test]" - name: Run pre-recorded tests - run: | - make test - working-directory: . - + run: make test pylint-and-black: runs-on: ubuntu-latest steps: - name: Check out code uses: actions/checkout@v6 - - name: Set up Python 3.11 uses: actions/setup-python@v6 with: python-version: "3.11" - - - name: Install dependencies for imports - run: | - pip install python-dateutil requests urllib3 fastjsonschema ratelimit listcrunch pyyaml pytest vcrpy python-squarelet - - - name: Install pylint and black - run: | - pip install pylint black - + - name: Install dependencies + run: pip install -e ".[dev,test]" - name: Run pylint and black on ./documentcloud and ./tests run: | - pylint ./documentcloud ./tests; black --check ./documentcloud ./tests + pylint ./documentcloud ./tests + black --check ./documentcloud ./tests \ No newline at end of file diff --git a/documentcloud/client.py b/documentcloud/client.py index 1acee53..04ae838 100644 --- a/documentcloud/client.py +++ b/documentcloud/client.py @@ -1,12 +1,12 @@ -# Import SquareletClient from python-squarelet # Standard Library import logging +import time # Third Party +import token_bucket from squarelet import SquareletClient # Local -# Local Imports from .documents import DocumentClient from .organizations import OrganizationClient from .projects import ProjectClient @@ -14,6 +14,22 @@ logger = logging.getLogger("documentcloud") +# Per-endpoint rate limits applied on top of the global squarelet limit. +# Format: (method, url_pattern, rate_per_second, capacity) +# +# Endpoint Rate Burst Notes +# -------- ---- ----- ----- +# GET documents/search 15/min 50 +# POST documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min +# PUT documents/ 12/min 100 25 docs/bulk call = up to 300 docs/min +# GET files/ 15/min 100 PDFs, full text, and other private assets +ENDPOINT_RATE_LIMITS = [ + ("GET", "documents/search", 15 / 60, 50), + ("POST", "documents/", 12 / 60, 100), + ("PUT", "documents/", 12 / 60, 100), + ("GET", "files/", 15 / 60, 100), +] + class DocumentCloud(SquareletClient): """ @@ -51,8 +67,34 @@ def __init__( else: logger.addHandler(logging.NullHandler()) + # Build per-endpoint token bucket rate limiters + storage = token_bucket.MemoryStorage() + self._endpoint_limiters = [ + ( + pattern_method, + pattern, + token_bucket.Limiter(rate=rate, capacity=capacity, storage=storage), + f"{pattern_method}:{pattern}", + ) + for pattern_method, pattern, rate, capacity in ENDPOINT_RATE_LIMITS + ] + # Initialize the sub-clients using SquareletClient self.documents = DocumentClient(self) self.projects = ProjectClient(self) self.users = UserClient(self) self.organizations = OrganizationClient(self) + + def request(self, method, url, raise_error=True, **kwargs): + for pattern_method, pattern, limiter, bucket_key in self._endpoint_limiters: + if pattern_method.upper() == method.upper() and pattern in url: + if not limiter.consume(bucket_key): + logger.warning( + "Rate limit reached for %s %s, throttling...", + method.upper(), + pattern, + ) + while not limiter.consume(bucket_key): + time.sleep(0.1) + return super().request(method, url, raise_error=raise_error, **kwargs) + return super().request(method, url, raise_error=raise_error, **kwargs) diff --git a/documentcloud/documents.py b/documentcloud/documents.py index d126f9d..0a7dd65 100644 --- a/documentcloud/documents.py +++ b/documentcloud/documents.py @@ -7,11 +7,13 @@ import logging import os import re +import time import warnings from functools import partial from urllib.parse import urlparse # Third Party +import token_bucket from requests.exceptions import RequestException # Local @@ -28,6 +30,8 @@ IMAGE_SIZES = ["thumbnail", "small", "normal", "large", "xlarge"] +DEFAULT_USER_AGENT = "python-documentcloud" + class Document(BaseAPIObject): """A single DocumentCloud document""" @@ -164,12 +168,17 @@ def _get_url(self, url, fmt=None): if base_netloc == url_netloc: # if the url host is the same as the base api host, - # sent the request with the client in order to include + # send the request with the client in order to include # authentication credentials response = self._client.get(url, full_url=True) else: - response = requests_retry_session().get( - url, headers={"User-Agent": "python-documentcloud2"} + response = self._client.documents.asset_get( + url, + headers={ + "User-Agent": self._client.session.headers.get( + "User-Agent", DEFAULT_USER_AGENT + ) + }, ) if fmt == "text": return response.content.decode("utf8") @@ -246,6 +255,26 @@ class DocumentClient(BaseAPIClient): api_path = "documents" resource = Document + def __init__(self, client): + super().__init__(client) + # Rate limit for public document asset fetches (S3-hosted). + # Private document assets go through the API client and are limited there. + # Token bucket: burst of 100, sustained at 15/min (0.25/sec). + storage = token_bucket.MemoryStorage() + self._asset_limiter = token_bucket.Limiter( + rate=15 / 60, + capacity=100, + storage=storage, + ) + self._asset_session = requests_retry_session() + + def asset_get(self, url, **kwargs): + if not self._asset_limiter.consume("asset"): + logger.warning("Rate limit reached for asset fetch, throttling...") + while not self._asset_limiter.consume("asset"): + time.sleep(0.1) + return self._asset_session.get(url, **kwargs) + def search(self, query, **params): """Return documents matching a search query""" diff --git a/setup.py b/setup.py index a76b8ed..a55cc7f 100644 --- a/setup.py +++ b/setup.py @@ -27,6 +27,7 @@ "pyyaml", "fastjsonschema", "python-squarelet", + "token-bucket", ), extras_require={ "dev": [ @@ -40,6 +41,7 @@ "test": [ "pytest", "pytest-mock", + "pytest-xdist", "pytest-recording", "vcrpy", ], diff --git a/tests/test_client.py b/tests/test_client.py index 7915f05..76e8bf3 100644 --- a/tests/test_client.py +++ b/tests/test_client.py @@ -9,6 +9,7 @@ import ratelimit # DocumentCloud +from documentcloud import DocumentCloud from documentcloud.constants import RATE_LIMIT from documentcloud.exceptions import APIError, CredentialsFailedError @@ -111,3 +112,77 @@ def test_expired_refresh_token(short_client, record_mode): assert short_client.users.get("me") # check the refresh token was updated assert old_refresh_token != short_client.refresh_token + + +def test_endpoint_rate_limit_burst_exhaustion(): + """Token bucket should block after burst capacity is exhausted""" + client = DocumentCloud() + # Exhaust the search burst (capacity=50) + _pattern_method, _pattern, limiter, bucket_key = client._endpoint_limiters[0] + for _ in range(50): + limiter.consume(bucket_key) + assert not limiter.consume(bucket_key) + + +def test_endpoint_rate_limit_method_specificity(): + """GET and POST to documents/ should use different limiters""" + client = DocumentCloud() + limiters = {(pm, p): lim for pm, p, lim, _ in client._endpoint_limiters} + assert limiters[("GET", "files/")] is not limiters[("POST", "documents/")] + + +def test_endpoint_rate_limit_pattern_ordering(): + """documents/search should match before documents/""" + client = DocumentCloud() + url = "documents/search/" + matched = next( + p for pm, p, _, _ in client._endpoint_limiters if pm == "GET" and p in url + ) + assert matched == "documents/search" + + +def test_asset_rate_limit_burst_exhaustion(): + """Asset token bucket should block after burst capacity is exhausted""" + client = DocumentCloud() + limiter = client.documents._asset_limiter + for _ in range(100): + limiter.consume("asset") + assert not limiter.consume("asset") + + +def test_asset_rate_limit_refills(): + """Asset token bucket should refill over time""" + client = DocumentCloud() + limiter = client.documents._asset_limiter + for _ in range(100): + limiter.consume("asset") + assert not limiter.consume("asset") + time.sleep(5) + assert limiter.consume("asset") + + +def test_endpoint_rate_limit_buckets_are_independent(): + """Exhausting one endpoint's bucket should not affect another""" + client = DocumentCloud() + limiters = {(pm, p): (lim, bk) for pm, p, lim, bk in client._endpoint_limiters} + search_limiter, search_key = limiters[("GET", "documents/search")] + files_limiter, files_key = limiters[("GET", "files/")] + + # Exhaust search bucket + for _ in range(50): + search_limiter.consume(search_key) + assert not search_limiter.consume(search_key) + + # Files bucket should still have tokens + assert files_limiter.consume(files_key) + + +def test_endpoint_rate_limit_no_match_for_unrecognized_url(): + """Unrecognized URLs should not match any endpoint limiter""" + client = DocumentCloud() + url = "users/me/" + matched = next( + (p for pm, p, _, _ in client._endpoint_limiters if p in url), + None, + ) + assert matched is None From c3995d0b5514332252a9b4a6d8ab5bef81120866 Mon Sep 17 00:00:00 2001 From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com> Date: Mon, 15 Jun 2026 18:05:09 -0500 Subject: [PATCH 10/13] Bump version and update changelog --- docs/changelog.rst | 4 ++++ docs/conf.py | 4 ++-- setup.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 8cea322..8520b34 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,10 @@ Changelog --------- +4.7.0 +~~~~~ +* Added burst-based sane rate limits to several endpoints. + 4.6.0 ~~~~~ * Added load_run_data and store_run_data on the Add-On class to access AddOn run data. diff --git a/docs/conf.py b/docs/conf.py index 7c76388..e80e36c 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = "4.6" +version = "4.7" # The full version, including alpha/beta/rc tags. -release = "4.6.0" +release = "4.7.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index a55cc7f..fb58793 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="python-documentcloud", - version="4.6.0", + version="4.7.0", description="A simple Python wrapper for the DocumentCloud API", author="Mitchell Kotler", author_email="mitch@muckrock.com", From bab56e579dc3160a0903e2c781bc286d778c41b9 Mon Sep 17 00:00:00 2001 From: Julie Lee <5377598+synapticlee@users.noreply.github.com> Date: Mon, 29 Jun 2026 16:26:24 -0400 Subject: [PATCH 11/13] Tiny typo - y2 is there twice --- documentcloud/annotations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/documentcloud/annotations.py b/documentcloud/annotations.py index 01b1457..c02290d 100644 --- a/documentcloud/annotations.py +++ b/documentcloud/annotations.py @@ -79,7 +79,7 @@ def create( x2=None, y2=None, ): - coords = [x1, y2, x2, y2] + coords = [x1, y1, x2, y2] if not (all(c is None for c in coords) or all(c is not None for c in coords)): raise ValueError( "x1, y2, x2, y2 must either all be None or all be not None" From 5e20d20024c22c44a4ad01a9ab421c4e88be393c Mon Sep 17 00:00:00 2001 From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com> Date: Mon, 29 Jun 2026 16:49:41 -0500 Subject: [PATCH 12/13] Bump to 4.8.0 --- docs/changelog.rst | 4 ++++ docs/conf.py | 4 ++-- setup.py | 2 +- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 8520b34..b732023 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,6 +1,10 @@ Changelog --------- +4.8.0 +~~~~~ +* Fixes coordinates for annotations. Credit: @synapticlee + 4.7.0 ~~~~~ * Added burst-based sane rate limits to several endpoints. diff --git a/docs/conf.py b/docs/conf.py index e80e36c..927ef91 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -55,9 +55,9 @@ # built documents. # # The short X.Y version. -version = "4.7" +version = "4.8" # The full version, including alpha/beta/rc tags. -release = "4.7.0" +release = "4.8.0" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index fb58793..39a1809 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="python-documentcloud", - version="4.7.0", + version="4.8.0", description="A simple Python wrapper for the DocumentCloud API", author="Mitchell Kotler", author_email="mitch@muckrock.com", From ca615a45946d6a08e5f66516a77bae13adbab66b Mon Sep 17 00:00:00 2001 From: duckduckgrayduck <102841251+duckduckgrayduck@users.noreply.github.com> Date: Mon, 29 Jun 2026 17:10:27 -0500 Subject: [PATCH 13/13] 4.8.1 --- docs/changelog.rst | 2 +- docs/conf.py | 2 +- setup.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index b732023..39939f0 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -1,7 +1,7 @@ Changelog --------- -4.8.0 +4.8.1 ~~~~~ * Fixes coordinates for annotations. Credit: @synapticlee diff --git a/docs/conf.py b/docs/conf.py index 927ef91..faf0f84 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -57,7 +57,7 @@ # The short X.Y version. version = "4.8" # The full version, including alpha/beta/rc tags. -release = "4.8.0" +release = "4.8.1" # The language for content autogenerated by Sphinx. Refer to documentation # for a list of supported languages. diff --git a/setup.py b/setup.py index 39a1809..e30b375 100644 --- a/setup.py +++ b/setup.py @@ -7,7 +7,7 @@ setup( name="python-documentcloud", - version="4.8.0", + version="4.8.1", description="A simple Python wrapper for the DocumentCloud API", author="Mitchell Kotler", author_email="mitch@muckrock.com",