diff --git a/.github/dependabot.yml b/.github/dependabot.yml
new file mode 100644
index 0000000..47ebd5e
--- /dev/null
+++ b/.github/dependabot.yml
@@ -0,0 +1,27 @@
+# To get started with Dependabot version updates, you'll need to specify which
+# package ecosystems to update and where the package manifests are located.
+# Please see the documentation for all configuration options:
+# https://docs.github.com/code-security/dependabot/dependabot-version-updates/configuration-options-for-the-dependabot.yml-file
+
+version: 2
+updates:
+ # Enable version updates for GitHub Actions
+ - package-ecosystem: "github-actions"
+ groups:
+ actions:
+ patterns:
+ - "*"
+ directory: "/"
+ schedule:
+ # Check for updates to GitHub Actions every week
+ interval: "weekly"
+
+ # Enable version updates for pre-commit hooks
+ - package-ecosystem: "pre-commit"
+ directory: "/"
+ schedule:
+ interval: "weekly"
+ groups:
+ pre-commit:
+ patterns:
+ - "*"
diff --git a/.github/workflows/check.yml b/.github/workflows/check.yml
index 3d48c6f..e46401c 100644
--- a/.github/workflows/check.yml
+++ b/.github/workflows/check.yml
@@ -6,6 +6,10 @@ on:
permissions:
contents: read
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
jobs:
check:
runs-on: ubuntu-latest
@@ -16,23 +20,27 @@ jobs:
steps:
- uses: actions/checkout@v5
- - name: Set up Python 3.12
- uses: actions/setup-python@v6
- with:
- python-version: "3.12"
- name: Install uv
- uses: astral-sh/setup-uv@v6
+ uses: astral-sh/setup-uv@v8.1.0
with:
+ python-version: "3.12"
enable-cache: true
cache-dependency-glob: "pyproject.toml"
- name: Install package with check dependencies
- run: uv sync --extra check
+ run: uv sync --group check
+
+ - name: Run Ruff linter
+ uses: astral-sh/ruff-action@v3
+ with:
+ args: "check --output-format=github"
- # check with ruff
- - name: Run ruff
- run: uv run ruff check
+ - name: Run Ruff formatter
+ # NOTE: ruff format does not currently support github output format
+        run: uv run ruff format --check --diff src tests
+ # Check formatting even if the previous step failed
+ if: always()
# check docs build
- name: Check that documentation builds with no errors or warnings
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 6e42013..2a008cf 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -11,6 +11,8 @@ name: Upload Python Package
on:
release:
types: [published]
+ # allow manually running on main
+ workflow_dispatch:
permissions:
contents: read
diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml
index fc93c1b..806b68f 100644
--- a/.github/workflows/unit_tests.yml
+++ b/.github/workflows/unit_tests.yml
@@ -4,6 +4,10 @@ permissions:
contents: read
id-token: write
+concurrency:
+ group: ${{ github.workflow }}-${{ github.ref }}
+ cancel-in-progress: true
+
on:
push:
branches:
@@ -34,23 +38,23 @@ jobs:
# use github python action instead of uv to take advantage of caching
- name: Set up Python ${{ matrix.python }}
- uses: actions/setup-python@v6
+ uses: astral-sh/setup-uv@v8.1.0
with:
python-version: ${{ matrix.python }}
-          cache: 'pip'
-          cache-dependency-path: '**/pyproject.toml'
+          enable-cache: true
+          cache-dependency-glob: '**/pyproject.toml'
- - name: Install package with dependencies
- run: pip install -e ".[test]"
+      - name: Install package with test dependencies
+ run: uv sync --group test
# for all versions but the one we use for code coverage, run normally
- name: Run unit tests without code coverage
- run: pytest
+ run: uv run pytest
if: ${{ matrix.python != env.COV_PYTHON_VERSION }}
# run code coverage in one version only
- name: Run unit tests with code coverage reporting
- run: pytest --cov=.
+ run: uv run pytest --cov=.
if: ${{ matrix.python == env.COV_PYTHON_VERSION }}
- name: Upload coverage to Codecov
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e49e09d..873c6fd 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,22 +1,45 @@
-files: \.py
repos:
+ # ruff for linting and formatting python
- repo: https://github.com/astral-sh/ruff-pre-commit
- rev: v0.3.4
+ rev: v0.15.12
hooks:
- - id: ruff
- args: [ --fix, --exit-non-zero-on-fix ]
+ - id: ruff-check
+ args: [ --fix, --show-fixes, --exit-non-zero-on-fix ]
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
- rev: v4.3.0
+ rev: v6.0.0
hooks:
- id: check-case-conflict
+ - id: check-merge-conflict
- id: check-executables-have-shebangs
- id: debug-statements
- id: end-of-file-fixer
- id: mixed-line-ending
- id: trailing-whitespace
+ - id: check-yaml
+ - id: name-tests-test
+ args: [--pytest-test-first]
+
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.13.0
+ rev: v2.0.0
hooks:
- id: mypy
additional_dependencies: [numpy]
+ # yamlfmt for formatting YAML files
+ - repo: https://github.com/google/yamlfmt
+ rev: v0.21.0
+ hooks:
+ - id: yamlfmt
+ # Codespell for spell checking
+ - repo: https://github.com/codespell-project/codespell
+ rev: v2.4.2
+ hooks:
+ - id: codespell
+ additional_dependencies:
+ - tomli
+ exclude_types: ["css", "html", "javascript", "json"]
+ # Validate GitHub Actions workflow files
+ - repo: https://github.com/mpalmer/action-validator
+ rev: v0.9.0
+ hooks:
+ - id: action-validator
diff --git a/.readthedocs.yaml b/.readthedocs.yaml
index a2bd875..250396f 100644
--- a/.readthedocs.yaml
+++ b/.readthedocs.yaml
@@ -11,8 +11,8 @@ sphinx:
configuration: docs/conf.py
python:
- install:
- - method: pip
- path: .
- extra_requirements:
- - docs
+ install:
+ - method: uv
+ command: sync
+ groups:
+ - docs
diff --git a/DEVELOPER_NOTES.md b/DEVELOPER_NOTES.md
index c7db002..6ea99d6 100644
--- a/DEVELOPER_NOTES.md
+++ b/DEVELOPER_NOTES.md
@@ -44,9 +44,15 @@ source .venv/bin/activate
Install an editable version of the local package along with python dependencies needed for testing and development.
```sh
-pip install -e ".[dev]"
+pip install -e . --group=dev
```
+If using `uv`, use
+
+```sh
+uv sync --group test
+```
+
### Install pre-commit hooks
We use [pre-commit](https://pre-commit.com/) for automated checks and consistent formatting. If you're planning to contribute, please install these when you set up your local development.
diff --git a/examples/edtf-support.ipynb b/examples/edtf-support.ipynb
index a604838..5cc7e20 100644
--- a/examples/edtf-support.ipynb
+++ b/examples/edtf-support.ipynb
@@ -34,12 +34,12 @@
"### Date\n",
"\n",
"```\n",
- "complete representation: [year][“-”][month][“-”][day]\n",
- "Example 1 ‘1985-04-12’ refers to the calendar date 1985 April 12th with day precision.\n",
- "reduced precision for year and month: [year][“-”][month]\n",
- "Example 2 ‘1985-04’ refers to the calendar month 1985 April with month precision.\n",
+ "complete representation: [year][-][month][-][day]\n",
+ "Example 1 1985-04-12 refers to the calendar date 1985 April 12th with day precision.\n",
+ "reduced precision for year and month: [year][-][month]\n",
+ "Example 2 1985-04 refers to the calendar month 1985 April with month precision.\n",
"reduced precision for year: [year]\n",
- "Example 3 ‘1985’ refers to the calendar year 1985 with year precision.\n",
+ "Example 3 1985 refers to the calendar year 1985 with year precision.\n",
"```"
]
},
@@ -60,9 +60,9 @@
"metadata": {},
"outputs": [],
"source": [
- "import datetime \n",
+ "import datetime\n",
"\n",
- "from undate import Undate, UndateInterval, DatePrecision\n",
+ "from undate import DatePrecision, Undate, UndateInterval\n",
"\n",
"# Example 1: day\n",
"day = Undate.parse(\"1985-04-12\", \"EDTF\")\n",
@@ -97,8 +97,8 @@
"metadata": {},
"outputs": [],
"source": [
- "from undate.undate import Undate, DatePrecision\n",
"from undate.converters.edtf import EDTFDateConverter\n",
+ "from undate.undate import DatePrecision, Undate\n",
"\n",
"# set default format to EDTF\n",
"Undate.DEFAULT_CONVERTER = \"EDTF\"\n",
@@ -140,12 +140,12 @@
"EDTF Level 0 adopts representations of a time interval where both the start and end are dates: start and end date only; that is, both start and duration, and duration and end, are excluded. Time of day is excluded.\n",
"\n",
"```\n",
- " Example 1 ‘1964/2008’ is a time interval with calendar year precision, beginning sometime in 1964 and ending sometime in 2008.\n",
- " Example 2 ‘2004-06/2006-08’ is a time interval with calendar month precision, beginning sometime in June 2004 and ending sometime in August of 2006.\n",
- " Example 3 ‘2004-02-01/2005-02-08’ is a time interval with calendar day precision, beginning sometime on February 1, 2004 and ending sometime on February 8, 2005.\n",
- " Example 4 ‘2004-02-01/2005-02’ is a time interval beginning sometime on February 1, 2004 and ending sometime in February 2005. Since the start endpoint precision (day) is different than that of the end endpoint (month) the precision of the time interval at large is undefined.\n",
- " Example 5 ‘2004-02-01/2005’ is a time interval beginning sometime on February 1, 2004 and ending sometime in 2005. The start endpoint has calendar day precision and the end endpoint has calendar year precision. Similar to the previous example, the precision of the time interval at large is undefined.\n",
- " Example 6 ‘2005/2006-02’ is a time interval beginning sometime in 2005 and ending sometime in February 2006.\n",
+ " Example 1 1964/2008 is a time interval with calendar year precision, beginning sometime in 1964 and ending sometime in 2008.\n",
+ " Example 2 2004-06/2006-08 is a time interval with calendar month precision, beginning sometime in June 2004 and ending sometime in August of 2006.\n",
+ " Example 3 2004-02-01/2005-02-08 is a time interval with calendar day precision, beginning sometime on February 1, 2004 and ending sometime on February 8, 2005.\n",
+ " Example 4 2004-02-01/2005-02 is a time interval beginning sometime on February 1, 2004 and ending sometime in February 2005. Since the start endpoint precision (day) is different than that of the end endpoint (month) the precision of the time interval at large is undefined.\n",
+ " Example 5 2004-02-01/2005 is a time interval beginning sometime on February 1, 2004 and ending sometime in 2005. The start endpoint has calendar day precision and the end endpoint has calendar year precision. Similar to the previous example, the precision of the time interval at large is undefined.\n",
+ " Example 6 2005/2006-02 is a time interval beginning sometime in 2005 and ending sometime in February 2006.\n",
"```"
]
},
@@ -179,7 +179,7 @@
"assert isinstance(day_range, UndateInterval)\n",
"assert day_range.earliest == Undate(2004, 2, 1)\n",
"assert day_range.latest == Undate(2005, 2, 8)\n",
- "# Example 4 \n",
+ "# Example 4\n",
"day_month_range = Undate.parse(\"2004-02-01/2005-02\", \"EDTF\")\n",
"assert isinstance(day_range, UndateInterval)\n",
"assert day_month_range.earliest == Undate(2004, 2, 1)\n",
@@ -193,13 +193,13 @@
"assert day_year_range.latest == Undate(2005)\n",
"assert day_year_range.earliest.precision == DatePrecision.DAY\n",
"assert day_year_range.latest.precision == DatePrecision.YEAR\n",
- "# Example 6 \n",
+ "# Example 6\n",
"year_month_range = Undate.parse(\"2005/2006-02\", \"EDTF\")\n",
"assert isinstance(year_month_range, UndateInterval)\n",
"assert year_month_range.earliest == Undate(2005)\n",
"assert year_month_range.latest == Undate(2006, 2)\n",
"assert year_month_range.earliest.precision == DatePrecision.YEAR\n",
- "assert year_month_range.latest.precision == DatePrecision.MONTH\n"
+ "assert year_month_range.latest.precision == DatePrecision.MONTH"
]
},
{
@@ -220,14 +220,24 @@
"# Example 1\n",
"assert UndateInterval(Undate(1964), Undate(2008)).format(\"EDTF\") == \"1964/2008\"\n",
"# Example 2\n",
- "assert UndateInterval(Undate(2004, 6), Undate(2006, 8)).format(\"EDTF\") == \"2004-06/2006-08\"\n",
+ "assert (\n",
+ " UndateInterval(Undate(2004, 6), Undate(2006, 8)).format(\"EDTF\") == \"2004-06/2006-08\"\n",
+ ")\n",
"# Example 3\n",
- "assert UndateInterval(Undate(2004, 2, 1), Undate(2005, 2, 8)).format(\"EDTF\") == \"2004-02-01/2005-02-08\"\n",
- "# Example 4 \n",
- "assert UndateInterval(Undate(2004, 2, 1), Undate(2005, 2)).format(\"EDTF\") == \"2004-02-01/2005-02\"\n",
+ "assert (\n",
+ " UndateInterval(Undate(2004, 2, 1), Undate(2005, 2, 8)).format(\"EDTF\")\n",
+ " == \"2004-02-01/2005-02-08\"\n",
+ ")\n",
+ "# Example 4\n",
+ "assert (\n",
+ " UndateInterval(Undate(2004, 2, 1), Undate(2005, 2)).format(\"EDTF\")\n",
+ " == \"2004-02-01/2005-02\"\n",
+ ")\n",
"# Example 5\n",
- "assert UndateInterval(Undate(2004, 2, 1), Undate(2005)).format(\"EDTF\") == \"2004-02-01/2005\"\n",
- "# Example 6 \n",
+ "assert (\n",
+ " UndateInterval(Undate(2004, 2, 1), Undate(2005)).format(\"EDTF\") == \"2004-02-01/2005\"\n",
+ ")\n",
+ "# Example 6\n",
"assert UndateInterval(Undate(2005), Undate(2006, 2)).format(\"EDTF\") == \"2005/2006-02\""
]
},
@@ -248,8 +258,8 @@
"\n",
"'Y' may be used at the beginning of the date string to signify that the date is a year, when (and only when) the year exceeds four digits, i.e. for years later than 9999 or earlier than -9999.\n",
"```\n",
- " Example 1 'Y170000002' is the year 170000002\n",
- " Example 2 'Y-170000002' is the year -170000002\n",
+ " Example 1 Y170000002 is the year 170000002\n",
+ " Example 2 Y-170000002 is the year -170000002\n",
"```\n"
]
},
@@ -307,14 +317,14 @@
"The character 'X' may be used in place of one or more rightmost digits to indicate that the value of that digit is unspecified, for the following cases:\n",
"```\n",
" A year with one or two (rightmost) unspecified digits in a year-only expression (year precision)\n",
- " Example 1 ‘201X’\n",
- " Example 2 ‘20XX’\n",
+ " Example 1 201X\n",
+ " Example 2 20XX\n",
" Year specified, month unspecified in a year-month expression (month precision)\n",
- " Example 3 ‘2004-XX’\n",
+ " Example 3 2004-XX\n",
" Year and month specified, day unspecified in a year-month-day expression (day precision)\n",
- " Example 4 ‘1985-04-XX’ \n",
+ " Example 4 1985-04-XX\n",
" Year specified, day and month unspecified in a year-month-day expression (day precision)\n",
- " Example 5 ‘1985-XX-XX’ \n",
+ " Example 5 1985-XX-XX\n",
"```"
]
},
@@ -325,7 +335,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Example 1 ‘201X’\n",
+ "# Example 1 201X\n",
"# parse\n",
"date = Undate.parse(\"201X\", \"EDTF\")\n",
"assert date.year == \"201X\"\n",
@@ -336,7 +346,7 @@
"# format\n",
"assert str(Undate(\"201X\")) == \"201X\"\n",
"\n",
- "# Example 2 ‘20XX’\n",
+ "# Example 2 20XX\n",
"# parse\n",
"date = Undate.parse(\"20XX\", \"EDTF\")\n",
"assert date.year == \"20XX\"\n",
@@ -347,7 +357,7 @@
"# format\n",
"assert str(Undate(\"20XX\")) == \"20XX\"\n",
"\n",
- "# Example 3 ‘2004-XX’\n",
+ "# Example 3 2004-XX\n",
"# parse\n",
"date = Undate.parse(\"2004-XX\", \"EDTF\")\n",
"assert date.year == \"2004\"\n",
@@ -359,7 +369,7 @@
"# format\n",
"assert str(Undate(2004, \"XX\")) == \"2004-XX\"\n",
"\n",
- "# Example 4 ‘1985-04-XX’ \n",
+ "# Example 4 1985-04-XX\n",
"# parse\n",
"date = Undate.parse(\"1985-04-XX\", \"EDTF\")\n",
"assert date.year == \"1985\"\n",
@@ -372,7 +382,7 @@
"# format\n",
"assert str(Undate(1985, 4, \"XX\")) == \"1985-04-XX\"\n",
"\n",
- "# Example 5 ‘1985-XX-XX’ \n",
+ "# Example 5 1985-XX-XX\n",
"# parse\n",
"date = Undate.parse(\"1985-XX-XX\", \"EDTF\")\n",
"assert date.year == \"1985\"\n",
@@ -384,7 +394,9 @@
"assert date.latest.month == 12\n",
"# earliest/latest possible days\n",
"assert date.earliest.day == 1\n",
- "assert date.latest.day == 31 # undate guesses maximum month length when month is unknown\n",
+ "assert (\n",
+ " date.latest.day == 31\n",
+ ") # undate guesses maximum month length when month is unknown\n",
"# format\n",
"assert str(Undate(1985, \"XX\", \"XX\")) == \"1985-XX-XX\""
]
@@ -415,11 +427,11 @@
"`undate` supports open ended time intervals, but does not currently distinguish between null string and double dot.\n",
"\n",
"\n",
- " Example 1 ‘1985-04-12/..’\n",
+ " Example 1 1985-04-12/..\n",
" interval starting at 1985 April 12th with day precision; end open\n",
- " Example 2 ‘1985-04/..’\n",
+ " Example 2 1985-04/..\n",
" interval starting at 1985 April with month precision; end open\n",
- " Example 3 ‘1985/..’\n",
+ " Example 3 1985/..\n",
" interval starting at year 1985 with year precision; end open\n"
]
},
@@ -432,7 +444,7 @@
"source": [
"import datetime\n",
"\n",
- "# Example 1 ‘1985-04-12/..’\n",
+ "# Example 1 1985-04-12/..\n",
"# parse\n",
"interval = Undate.parse(\"1985-04-12/..\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -443,7 +455,7 @@
"# NOTE: undate interval does not currently distinguish between double dot and null string\n",
"assert str(UndateInterval(Undate(1985, 4, 12), None)) == \"1985-04-12/\"\n",
"\n",
- "# Example 2 ‘1985-04/..’\n",
+ "# Example 2 1985-04/..\n",
"# parse\n",
"interval = Undate.parse(\"1985-04/..\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -453,7 +465,7 @@
"# format\n",
"assert str(UndateInterval(Undate(1985, 4), None)) == \"1985-04/\"\n",
"\n",
- "# Example 3 ‘1985/..’\n",
+ "# Example 3 1985/..\n",
"# parse\n",
"interval = Undate.parse(\"1985/..\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -471,11 +483,11 @@
"source": [
"#### Open start time interval\n",
"\n",
- " Example 4 ‘../1985-04-12’\n",
+ " Example 4 ../1985-04-12\n",
" interval with open start; ending 1985 April 12th with day precision\n",
- " Example 5 ‘../1985-04’\n",
+ " Example 5 ../1985-04\n",
" interval with open start; ending 1985 April with month precision\n",
- " Example 6 ‘../1985’\n",
+ " Example 6 ../1985\n",
" interval with open start; ending at year 1985 with year precision"
]
},
@@ -486,7 +498,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Example 4 ‘../1985-04-12’\n",
+ "# Example 4 ../1985-04-12\n",
"# parse\n",
"interval = Undate.parse(\"../1985-04-12\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -497,7 +509,7 @@
"# NOTE: undate interval does not currently distinguish between double dot and null string\n",
"assert str(UndateInterval(None, Undate(1985, 4, 12))) == \"../1985-04-12\"\n",
"\n",
- "# Example 5 ‘../1985-04’\n",
+ "# Example 5 ../1985-04\n",
"# parse\n",
"interval = Undate.parse(\"../1985-04\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -505,9 +517,17 @@
"assert interval.latest == Undate(1985, 4)\n",
"assert interval.latest.precision == DatePrecision.MONTH\n",
"# format\n",
- "assert str(UndateInterval(None, Undate(1985, 4), )) == \"../1985-04\"\n",
- "\n",
- "# Example 6 ‘../1985’\n",
+ "assert (\n",
+ " str(\n",
+ " UndateInterval(\n",
+ " None,\n",
+ " Undate(1985, 4),\n",
+ " )\n",
+ " )\n",
+ " == \"../1985-04\"\n",
+ ")\n",
+ "\n",
+ "# Example 6 ../1985\n",
"# parse\n",
"interval = Undate.parse(\"../1985\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -525,11 +545,11 @@
"source": [
"#### Time interval with unknown end\n",
"\n",
- " Example 7 ‘1985-04-12/’\n",
+ " Example 7 1985-04-12/\n",
" interval starting 1985 April 12th with day precision; end unknown\n",
- " Example 8 ‘1985-04/’\n",
+ " Example 8 1985-04/\n",
" interval starting 1985 April with month precision; end unknown\n",
- " Example 9 ‘1985/’\n",
+ " Example 9 1985/\n",
" interval starting year 1985 with year precision; end unknown\n"
]
},
@@ -540,7 +560,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Example 7 ‘1985-04-12/’\n",
+ "# Example 7 1985-04-12/\n",
"# parse\n",
"interval = Undate.parse(\"1985-04-12/\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -551,7 +571,7 @@
"# NOTE: undate interval does not currently distinguish between double dot and null string\n",
"assert str(UndateInterval(Undate(1985, 4, 12), None)) == \"1985-04-12/\"\n",
"\n",
- "# Example 8 ‘1985-04/’\n",
+ "# Example 8 1985-04/\n",
"# parse\n",
"interval = Undate.parse(\"1985-04/\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -561,7 +581,7 @@
"# format\n",
"assert str(UndateInterval(Undate(1985, 4), None)) == \"1985-04/\"\n",
"\n",
- "# Example 9 ‘1985/’\n",
+ "# Example 9 1985/\n",
"# parse\n",
"interval = Undate.parse(\"1985/\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -579,11 +599,11 @@
"source": [
"#### Time interval with unknown start\n",
"\n",
- " Example 10 ‘/1985-04-12’\n",
+ " Example 10 /1985-04-12\n",
" interval with unknown start; ending 1985 April 12th with day precision\n",
- " Example 11 ‘/1985-04’\n",
+ " Example 11 /1985-04\n",
" interval with unknown start; ending 1985 April with month precision\n",
- " Example 12 ‘/1985’\n",
+ " Example 12 /1985\n",
" interval with unknown start; ending year 1985 with year precision\n"
]
},
@@ -594,7 +614,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Example 10 ‘/1985-04-12’\n",
+ "# Example 10 /1985-04-12\n",
"# parse\n",
"interval = Undate.parse(\"/1985-04-12\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -605,7 +625,7 @@
"# NOTE: undate interval does not currently distinguish between double dot and null string\n",
"assert str(UndateInterval(None, Undate(1985, 4, 12))) == \"../1985-04-12\"\n",
"\n",
- "# Example 11 ‘/1985-04’\n",
+ "# Example 11 /1985-04\n",
"# parse\n",
"interval = Undate.parse(\"/1985-04\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -613,9 +633,17 @@
"assert interval.latest == Undate(1985, 4)\n",
"assert interval.latest.precision == DatePrecision.MONTH\n",
"# format\n",
- "assert str(UndateInterval(None, Undate(1985, 4), )) == \"../1985-04\"\n",
- "\n",
- "# Example 12 ‘/1985’\n",
+ "assert (\n",
+ " str(\n",
+ " UndateInterval(\n",
+ " None,\n",
+ " Undate(1985, 4),\n",
+ " )\n",
+ " )\n",
+ " == \"../1985-04\"\n",
+ ")\n",
+ "\n",
+ "# Example 12 /1985\n",
"# parse\n",
"interval = Undate.parse(\"/1985\", \"EDTF\")\n",
"assert isinstance(interval, UndateInterval)\n",
@@ -633,7 +661,7 @@
"source": [
"#### Negative calendar year\n",
"\n",
- " Example 1 ‘-1985’\n",
+ " Example 1 -1985\n",
"\n",
"Note: ISO 8601 Part 1 does not support negative year. "
]
@@ -645,7 +673,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Example 1 ‘-1985’\n",
+ "# Example 1 -1985\n",
"# parse\n",
"neg_year = Undate.parse(\"-1985\", \"EDTF\")\n",
"assert neg_year.year == \"-1985\"\n",
@@ -672,17 +700,17 @@
"\n",
"For level 2 the unspecified digit, 'X', may occur anywhere within a component.\n",
"\n",
- " Example 1 ‘156X-12-25’\n",
+ " Example 1 156X-12-25\n",
" December 25 sometime during the 1560s\n",
- " Example 2 ‘15XX-12-25’\n",
+ " Example 2 15XX-12-25\n",
" December 25 sometime during the 1500s\n",
- " Example 3 ‘XXXX-12-XX’\n",
+ " Example 3 XXXX-12-XX\n",
" Some day in December in some year\n",
- " Example 4 '1XXX-XX’\n",
+ " Example 4 1XXX-XX\n",
" Some month during the 1000s\n",
- " Example 5 ‘1XXX-12’\n",
+ " Example 5 1XXX-12\n",
" Some December during the 1000s\n",
- " Example 6 ‘1984-1X’\n",
+ " Example 6 1984-1X\n",
" October, November, or December 1984"
]
},
@@ -693,7 +721,7 @@
"metadata": {},
"outputs": [],
"source": [
- "# Example 1 ‘156X-12-25’\n",
+ "# Example 1 156X-12-25\n",
"# parse\n",
"december = Undate.parse(\"156X-12-25\", \"EDTF\")\n",
"assert december.year == \"156X\"\n",
@@ -705,7 +733,7 @@
"# format\n",
"assert str(Undate(\"156X\", 12, 25)) == \"156X-12-25\"\n",
"\n",
- "# Example 2 ‘15XX-12-25’\n",
+ "# Example 2 15XX-12-25\n",
"# parse\n",
"december = Undate.parse(\"15XX-12-25\", \"EDTF\")\n",
"assert december.year == \"15XX\"\n",
@@ -717,7 +745,7 @@
"# format\n",
"assert str(Undate(\"15XX\", 12, 25)) == \"15XX-12-25\"\n",
"\n",
- "# Example 3 ‘XXXX-12-XX’\n",
+ "# Example 3 XXXX-12-XX\n",
"# parse\n",
"december = Undate.parse(\"XXXX-12-XX\", \"EDTF\")\n",
"assert december.year == \"XXXX\"\n",
@@ -732,7 +760,7 @@
"# format\n",
"assert str(Undate(\"XXXX\", 12, \"XX\")) == \"XXXX-12-XX\"\n",
"\n",
- "# Example 4 '1XXX-XX’\n",
+ "# Example 4 1XXX-XX\n",
"# parse\n",
"some_month = Undate.parse(\"1XXX-XX\", \"EDTF\")\n",
"assert some_month.year == \"1XXX\"\n",
@@ -743,7 +771,7 @@
"# format\n",
"assert str(Undate(\"1XXX\", \"XX\")) == \"1XXX-XX\"\n",
"\n",
- "# Example 5 ‘1XXX-12’\n",
+ "# Example 5 1XXX-12\n",
"# parse\n",
"some_december = Undate.parse(\"1XXX-12\", \"EDTF\")\n",
"assert some_december.year == \"1XXX\"\n",
@@ -754,7 +782,7 @@
"# format\n",
"assert str(Undate(\"1XXX\", 12)) == \"1XXX-12\"\n",
"\n",
- "# Example 6 ‘1984-1X’\n",
+ "# Example 6 1984-1X\n",
"# parse\n",
"late_1984 = Undate.parse(\"1984-1X\", \"EDTF\")\n",
"assert late_1984.year == \"1984\"\n",
diff --git a/examples/pgp_dates.ipynb b/examples/pgp_dates.ipynb
index 43a858c..65ece5e 100644
--- a/examples/pgp_dates.ipynb
+++ b/examples/pgp_dates.ipynb
@@ -34,20 +34,31 @@
},
{
"cell_type": "code",
- "execution_count": 4,
+ "execution_count": 5,
"id": "67c5532d-ebc4-4e1e-aa64-e6802ed1d971",
"metadata": {},
- "outputs": [],
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_78526/1738353942.py:6: DtypeWarning: Columns (31) have mixed types. Specify dtype option on import or set low_memory=False.\n",
+ " documents = pd.read_csv(pgp_documents_csv)\n"
+ ]
+ }
+ ],
"source": [
"import pandas as pd\n",
"\n",
- "pgp_documents_csv = \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n",
+ "pgp_documents_csv = (\n",
+ " \"https://github.com/princetongenizalab/pgp-metadata/raw/main/data/documents.csv\"\n",
+ ")\n",
"documents = pd.read_csv(pgp_documents_csv)"
]
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 6,
"id": "41dc5a05-a04b-4b6d-acfe-1f7b04849346",
"metadata": {},
"outputs": [
@@ -56,16 +67,18 @@
"output_type": "stream",
"text": [
"\n",
- "Total documents: 35,187\n",
- "Documents with dates: 4,451\n",
- " date on document: 4,126\n",
- " inferred dating: 331\n"
+ "Total documents: 35,938\n",
+ "Documents with dates: 6,737\n",
+ " date on document: 4,729\n",
+ " inferred dating: 2,040\n"
]
}
],
"source": [
"# limit to documents with dates\n",
- "docs_with_dates = documents[documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()]\n",
+ "docs_with_dates = documents[\n",
+ " documents.doc_date_standard.notna() | documents.inferred_date_standard.notna()\n",
+ "]\n",
"docs_with_docdate = documents[documents.doc_date_standard.notna()].copy()\n",
"docs_with_inferreddate = documents[documents.inferred_date_standard.notna()]\n",
"\n",
@@ -78,7 +91,7 @@
},
{
"cell_type": "code",
- "execution_count": 6,
+ "execution_count": 7,
"id": "94d6340b-10d0-461b-b745-378ffa1ffcec",
"metadata": {},
"outputs": [
@@ -115,7 +128,14 @@
"
449 | \n",
" 1570 | \n",
" Seleucid | \n",
- " 1259 | \n",
+ " 1258-08-31/1259-09-19 | \n",
+ " \n",
+ " \n",
+ " | 15 | \n",
+ " 462 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " 1056-06 | \n",
"
\n",
" \n",
" | 16 | \n",
@@ -173,13 +193,6 @@
" Seleucid | \n",
" 1130-10-06/1130-10-15 | \n",
"
\n",
- " \n",
- " | 61 | \n",
- " 524 | \n",
- " Thursday, 12 Sivan 4795 | \n",
- " Anno Mundi | \n",
- " 1035-05-22 | \n",
- "
\n",
" \n",
"\n",
""
@@ -187,6 +200,7 @@
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
"5 449 1570 Seleucid \n",
+ "15 462 NaN NaN \n",
"16 463 19 Adar 1427 Seleucid \n",
"17 464 Tammuz 1288 Seleucid \n",
"23 472 1337 Seleucid \n",
@@ -195,10 +209,10 @@
"43 502 Tevet 1548 Seleucid \n",
"47 506 Elul 1428 Seleucid \n",
"55 516 First decade of Ḥeshvan 1442 Seleucid \n",
- "61 524 Thursday, 12 Sivan 4795 Anno Mundi \n",
"\n",
" doc_date_standard \n",
- "5 1259 \n",
+ "5 1258-08-31/1259-09-19 \n",
+ "15 1056-06 \n",
"16 1116-03-05 \n",
"17 0977-06-21/0977-07-19 \n",
"23 1025-08-28/1026-09-14 \n",
@@ -206,17 +220,18 @@
"41 1188-12-07 \n",
"43 1236-11-30/1236-12-28 \n",
"47 1117-08-01/1117-08-29 \n",
- "55 1130-10-06/1130-10-15 \n",
- "61 1035-05-22 "
+ "55 1130-10-06/1130-10-15 "
]
},
- "execution_count": 6,
+ "execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "docs_with_docdate[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ "docs_with_docdate[\n",
+ " [\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]\n",
+ "].head(10)"
]
},
{
@@ -231,7 +246,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 8,
"id": "b9703b47-a7e2-4178-a7da-fb47db11b5b7",
"metadata": {},
"outputs": [
@@ -252,27 +267,30 @@
"from lark.visitors import VisitError\n",
"\n",
"# first, how far can we get with the standard dates? can we parse as edtf and sort, render?\n",
- "from undate import Undate \n",
+ "from undate import Undate\n",
+ "\n",
"\n",
"def parse_standard_date(value):\n",
" try:\n",
" return Undate.parse(value, \"EDTF\")\n",
" except VisitError as err:\n",
" print(f\"Parse error on {value}: {err}\")\n",
- " \n",
+ "\n",
"\n",
"# ignore gregorian/julian distinction for now\n",
"# from pgp code:\n",
"# Julian Thursday, 4 October 1582, being followed by Gregorian Friday, 15 October\n",
"# cut off between gregorian/julian dates, in julian days\n",
- "#gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n",
+ "# gregorian_start_jd = convertdate.julianday.from_julian(1582, 10, 5)\n",
"\n",
- "docs_with_docdate['undate_standard'] = docs_with_docdate.doc_date_standard.apply(parse_standard_date)"
+ "docs_with_docdate[\"undate_standard\"] = docs_with_docdate.doc_date_standard.apply(\n",
+ " parse_standard_date\n",
+ ")"
]
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 9,
"id": "f49e82a4-b05b-4395-998f-0c9e75729e9f",
"metadata": {},
"outputs": [
@@ -306,7 +324,7 @@
" \n",
" \n",
" \n",
- " | 3190 | \n",
+ " 3181 | \n",
" 3957 | \n",
" middle decade of Adar 1528 | \n",
" Seleucid | \n",
@@ -314,7 +332,7 @@
" 2025-04-12 20:45:36.603800+00:00 | \n",
"
\n",
" \n",
- " | 34437 | \n",
+ " 34293 | \n",
" 40006 | \n",
" NaN | \n",
" NaN | \n",
@@ -327,15 +345,15 @@
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "3190 3957 middle decade of Adar 1528 Seleucid \n",
- "34437 40006 NaN NaN \n",
+ "3181 3957 middle decade of Adar 1528 Seleucid \n",
+ "34293 40006 NaN NaN \n",
"\n",
" doc_date_standard last_modified \n",
- "3190 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n",
- "34437 1747-02-29 2024-08-07 18:24:19.425288+00:00 "
+ "3181 1217-02-20/1217-02-29 2025-04-12 20:45:36.603800+00:00 \n",
+ "34293 1747-02-29 2024-08-07 18:24:19.425288+00:00 "
]
},
- "execution_count": 8,
+ "execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
@@ -345,7 +363,15 @@
"\n",
"# this is probably a data error in the original\n",
"\n",
- "docs_with_docdate[docs_with_docdate.undate_standard.isna()][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'last_modified']]"
+ "docs_with_docdate[docs_with_docdate.undate_standard.isna()][\n",
+ " [\n",
+ " \"pgpid\",\n",
+ " \"doc_date_original\",\n",
+ " \"doc_date_calendar\",\n",
+ " \"doc_date_standard\",\n",
+ " \"last_modified\",\n",
+ " ]\n",
+ "]"
]
},
{
@@ -358,7 +384,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 10,
"id": "2d502575-a2b4-4fce-9f59-6932275dfac2",
"metadata": {},
"outputs": [
@@ -366,14 +392,14 @@
"data": {
"text/plain": [
"doc_date_calendar\n",
- "Seleucid 1604\n",
- "Anno Mundi 1147\n",
- "Hijrī 884\n",
+ "Seleucid 1794\n",
+ "Anno Mundi 1399\n",
+ "Hijrī 1063\n",
"Kharājī 8\n",
"Name: count, dtype: int64"
]
},
- "execution_count": 9,
+ "execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
@@ -384,7 +410,7 @@
},
{
"cell_type": "code",
- "execution_count": 10,
+ "execution_count": 11,
"id": "04e4ffb2-13e7-49cc-913b-2104b61aef16",
"metadata": {},
"outputs": [
@@ -424,6 +450,13 @@
" 1035-05-22 | \n",
"
\n",
" \n",
+ " | 70 | \n",
+ " 534 | \n",
+ " 3 Adar II 4845 | \n",
+ " Anno Mundi | \n",
+ " 1085-03-02 | \n",
+ "
\n",
+ " \n",
" | 90 | \n",
" 561 | \n",
" 10 Nisan 4716 | \n",
@@ -452,6 +485,13 @@
" 1044-08-27/1045-09-13 | \n",
"
\n",
" \n",
+ " | 174 | \n",
+ " 657 | \n",
+ " [Tammuz] 4831 | \n",
+ " Anno Mundi | \n",
+ " 1071-06-02/1071-06-30 | \n",
+ "
\n",
+ " \n",
" | 177 | \n",
" 660 | \n",
" 22 Sivan 4974 | \n",
@@ -472,20 +512,6 @@
" Anno Mundi | \n",
" 1051-08-18 | \n",
"
\n",
- " \n",
- " | 255 | \n",
- " 750 | \n",
- " Friday, 24 Ḥeshvan 4765 | \n",
- " Anno Mundi | \n",
- " 1004-11-10 | \n",
- "
\n",
- " \n",
- " | 264 | \n",
- " 760 | \n",
- " Thursday, 11 Av 4783 | \n",
- " Anno Mundi | \n",
- " 1023-08-01 | \n",
- "
\n",
" \n",
"\n",
""
@@ -493,25 +519,27 @@
"text/plain": [
" pgpid doc_date_original doc_date_calendar doc_date_standard\n",
"61 524 Thursday, 12 Sivan 4795 Anno Mundi 1035-05-22\n",
+ "70 534 3 Adar II 4845 Anno Mundi 1085-03-02\n",
"90 561 10 Nisan 4716 Anno Mundi 0956-03-24\n",
"111 582 Thursday, 6 Adar 4996 Anno Mundi 1236-02-14\n",
"119 591 Sunday, 29 Tammuz 4898 Anno Mundi 1138-07-10\n",
"131 603 4805/4806 Anno Mundi 1044-08-27/1045-09-13\n",
+ "174 657 [Tammuz] 4831 Anno Mundi 1071-06-02/1071-06-30\n",
"177 660 22 Sivan 4974 Anno Mundi 1214-06-01\n",
"207 695 Friday, [25] Nisan [4810] Anno Mundi 1050-04-20\n",
- "215 703 8 Elul (4)811 Anno Mundi 1051-08-18\n",
- "255 750 Friday, 24 Ḥeshvan 4765 Anno Mundi 1004-11-10\n",
- "264 760 Thursday, 11 Av 4783 Anno Mundi 1023-08-01"
+ "215 703 8 Elul (4)811 Anno Mundi 1051-08-18"
]
},
- "execution_count": 10,
+ "execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# example hebrew dates\n",
- "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ "docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][\n",
+ " [\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]\n",
+ "].head(10)"
]
},
{
@@ -530,7 +558,7 @@
},
{
"cell_type": "code",
- "execution_count": 11,
+ "execution_count": 12,
"id": "4d11e583-7c80-44ed-80b1-d0c5b7b7f408",
"metadata": {},
"outputs": [
@@ -538,8 +566,8 @@
"name": "stderr",
"output_type": "stream",
"text": [
- "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_38072/1200615794.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
- " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n"
+ "/var/folders/mb/6qm4h4yx3yqdy2bv2sjyp4z00000gp/T/ipykernel_78526/2303123184.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.\n",
+ " hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][\n"
]
},
{
@@ -571,56 +599,69 @@
" \n",
" \n",
" \n",
- " | 702 | \n",
+ " 698 | \n",
" 1223 | \n",
" Wednesday, 9 Tammuz 4912 AM | \n",
" Anno Mundi | \n",
" 1152-06-13 | \n",
"
\n",
" \n",
- " | 16698 | \n",
+ " 16600 | \n",
" 19975 | \n",
" Sunday, 10 Kislev 5583 AM | \n",
" Anno Mundi | \n",
" 1822-11-24 | \n",
"
\n",
" \n",
- " | 25415 | \n",
+ " 25299 | \n",
" 30550 | \n",
" Tammuz 5537 AM | \n",
" Anno Mundi | \n",
" 1777-07-06/1777-08-03 | \n",
"
\n",
+ " \n",
+ " | 35805 | \n",
+ " 41550 | \n",
+ " 3 Av 5325 AM | \n",
+ " Anno Mundi | \n",
+ " 1565-07-01 | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "702 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n",
- "16698 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n",
- "25415 30550 Tammuz 5537 AM Anno Mundi \n",
+ "698 1223 Wednesday, 9 Tammuz 4912 AM Anno Mundi \n",
+ "16600 19975 Sunday, 10 Kislev 5583 AM Anno Mundi \n",
+ "25299 30550 Tammuz 5537 AM Anno Mundi \n",
+ "35805 41550 3 Av 5325 AM Anno Mundi \n",
"\n",
" doc_date_standard \n",
- "702 1152-06-13 \n",
- "16698 1822-11-24 \n",
- "25415 1777-07-06/1777-08-03 "
+ "698 1152-06-13 \n",
+ "16600 1822-11-24 \n",
+ "25299 1777-07-06/1777-08-03 \n",
+ "35805 1565-07-01 "
]
},
- "execution_count": 11,
+ "execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many end with AM ?\n",
- "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][docs_with_docdate.doc_date_original.notna()]\n",
- "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ "hebrew_dates = docs_with_docdate[docs_with_docdate.doc_date_calendar == \"Anno Mundi\"][\n",
+ " docs_with_docdate.doc_date_original.notna()\n",
+ "]\n",
+ "hebrew_dates[hebrew_dates.doc_date_original.str.endswith(\"AM\")][\n",
+ " [\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]\n",
+ "]"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 13,
"id": "cd1a751a-5299-418f-a3f8-050ab0384354",
"metadata": {},
"outputs": [
@@ -653,41 +694,41 @@
" \n",
" \n",
" \n",
- " | 1556 | \n",
+ " 1120 | \n",
+ " 1699 | \n",
+ " Adar 52[..] | \n",
+ " Anno Mundi | \n",
+ " 1440-02-05/1539-02-18 | \n",
+ "
\n",
+ " \n",
+ " | 1132 | \n",
+ " 1711 | \n",
+ " 9 Av 13[.]8 | \n",
+ " Seleucid | \n",
+ " 0997-07-17/1087-07-13 | \n",
+ "
\n",
+ " \n",
+ " | 1551 | \n",
" 2163 | \n",
" first third of Tammuz 500[.] | \n",
" Anno Mundi | \n",
" 1244/1249 | \n",
"
\n",
" \n",
- " | 1567 | \n",
+ " 1562 | \n",
" 2175 | \n",
" End of Sivan 152[.] | \n",
" Seleucid | \n",
" 1209/1218 | \n",
"
\n",
" \n",
- " | 1753 | \n",
+ " 1748 | \n",
" 2460 | \n",
" 13[..] | \n",
" Seleucid | \n",
" 988/1088 | \n",
"
\n",
" \n",
- " | 2018 | \n",
- " 2745 | \n",
- " 1[.] Kislev 48[..] | \n",
- " Anno Mundi | \n",
- " 1039-11-30/1138-11-24 | \n",
- "
\n",
- " \n",
- " | 3044 | \n",
- " 3805 | \n",
- " 13[..] | \n",
- " Seleucid | \n",
- " 988/1087 | \n",
- "
\n",
- " \n",
" | ... | \n",
" ... | \n",
" ... | \n",
@@ -695,35 +736,35 @@
" ... | \n",
"
\n",
" \n",
- " | 30589 | \n",
- " 35955 | \n",
- " 12 Muḥarram 52[.] | \n",
- " Hijrī | \n",
- " 1126/1134 | \n",
- "
\n",
- " \n",
- " | 31226 | \n",
- " 36738 | \n",
- " 54[.] | \n",
- " Hijrī | \n",
- " 1145/1154 | \n",
- "
\n",
- " \n",
- " | 32548 | \n",
+ " 32412 | \n",
" 38077 | \n",
" 14[...] | \n",
" Seleucid | \n",
" 1088-09-19/1188-09-23 | \n",
"
\n",
" \n",
- " | 34652 | \n",
+ " 32804 | \n",
+ " 38478 | \n",
+ " 19 Tevet 47[..] | \n",
+ " Anno Mundi | \n",
+ " 0940-01-02/1038-12-19 | \n",
+ "
\n",
+ " \n",
+ " | 34173 | \n",
+ " 39886 | \n",
+ " 4[.]4 | \n",
+ " Hijrī | \n",
+ " 1023/1101 | \n",
+ "
\n",
+ " \n",
+ " | 34503 | \n",
" 40226 | \n",
" 49[.] | \n",
" Hijrī | \n",
" 1096-12-19/1106-09-01 | \n",
"
\n",
" \n",
- " | 34760 | \n",
+ " 34611 | \n",
" 40335 | \n",
" [4]82[.] | \n",
" Anno Mundi | \n",
@@ -731,52 +772,55 @@
"
\n",
" \n",
"\n",
- "66 rows × 4 columns
\n",
+ "115 rows × 4 columns
\n",
""
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
- "1567 2175 End of Sivan 152[.] Seleucid \n",
- "1753 2460 13[..] Seleucid \n",
- "2018 2745 1[.] Kislev 48[..] Anno Mundi \n",
- "3044 3805 13[..] Seleucid \n",
+ "1120 1699 Adar 52[..] Anno Mundi \n",
+ "1132 1711 9 Av 13[.]8 Seleucid \n",
+ "1551 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "1562 2175 End of Sivan 152[.] Seleucid \n",
+ "1748 2460 13[..] Seleucid \n",
"... ... ... ... \n",
- "30589 35955 12 Muḥarram 52[.] Hijrī \n",
- "31226 36738 54[.] Hijrī \n",
- "32548 38077 14[...] Seleucid \n",
- "34652 40226 49[.] Hijrī \n",
- "34760 40335 [4]82[.] Anno Mundi \n",
+ "32412 38077 14[...] Seleucid \n",
+ "32804 38478 19 Tevet 47[..] Anno Mundi \n",
+ "34173 39886 4[.]4 Hijrī \n",
+ "34503 40226 49[.] Hijrī \n",
+ "34611 40335 [4]82[.] Anno Mundi \n",
"\n",
" doc_date_standard \n",
- "1556 1244/1249 \n",
- "1567 1209/1218 \n",
- "1753 988/1088 \n",
- "2018 1039-11-30/1138-11-24 \n",
- "3044 988/1087 \n",
+ "1120 1440-02-05/1539-02-18 \n",
+ "1132 0997-07-17/1087-07-13 \n",
+ "1551 1244/1249 \n",
+ "1562 1209/1218 \n",
+ "1748 988/1088 \n",
"... ... \n",
- "30589 1126/1134 \n",
- "31226 1145/1154 \n",
- "32548 1088-09-19/1188-09-23 \n",
- "34652 1096-12-19/1106-09-01 \n",
- "34760 1059-09-11/1069-09-18 \n",
+ "32412 1088-09-19/1188-09-23 \n",
+ "32804 0940-01-02/1038-12-19 \n",
+ "34173 1023/1101 \n",
+ "34503 1096-12-19/1106-09-01 \n",
+ "34611 1059-09-11/1069-09-18 \n",
"\n",
- "[66 rows x 4 columns]"
+ "[115 rows x 4 columns]"
]
},
- "execution_count": 12,
+ "execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many include periods?\n",
- "docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']]"
+ "docs_with_docdate[\n",
+ " docs_with_docdate.doc_date_original.notna()\n",
+ " & docs_with_docdate.doc_date_original.str.contains(\"\\\\.\")\n",
+ "][[\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]]"
]
},
{
"cell_type": "code",
- "execution_count": 13,
+ "execution_count": 14,
"id": "9fa8d2ba-6612-4de5-8741-dea177f99412",
"metadata": {},
"outputs": [
@@ -809,74 +853,74 @@
" \n",
" \n",
" \n",
- " | 635 | \n",
+ " 631 | \n",
" 1154 | \n",
" Last decade of Kislev 5004 | \n",
" Anno Mundi | \n",
" 1243-12 | \n",
"
\n",
" \n",
- " | 1172 | \n",
+ " 1168 | \n",
" 1750 | \n",
" 11th Tammuz 4767 | \n",
" Anno Mundi | \n",
" 1007 | \n",
"
\n",
" \n",
- " | 1173 | \n",
+ " 1169 | \n",
" 1751 | \n",
" Monday, 27th Ṭevet 4797 | \n",
" Anno Mundi | \n",
" 1037-01-23 | \n",
"
\n",
" \n",
- " | 1556 | \n",
+ " 1551 | \n",
" 2163 | \n",
" first third of Tammuz 500[.] | \n",
" Anno Mundi | \n",
" 1244/1249 | \n",
"
\n",
" \n",
- " | 5142 | \n",
+ " 5126 | \n",
" 6795 | \n",
" last decade of Tishrei 4991 | \n",
" Anno Mundi | \n",
" 1230-09-29/1230-10-08 | \n",
"
\n",
" \n",
- " | 5223 | \n",
+ " 5207 | \n",
" 6892 | \n",
" last decade of Iyyar 4906 | \n",
" Anno Mundi | \n",
" 1146-05-04/1146-05-13 | \n",
"
\n",
" \n",
- " | 5664 | \n",
+ " 5646 | \n",
" 7409 | \n",
" last third of Ḥeshvan 4965 | \n",
" Anno Mundi | \n",
" 1204-10-17/1204-10-25 | \n",
"
\n",
" \n",
- " | 5812 | \n",
+ " 5794 | \n",
" 7581 | \n",
" middle third of Adar 4876 | \n",
" Anno Mundi | \n",
" 1116-05 | \n",
"
\n",
" \n",
- " | 7024 | \n",
+ " 7003 | \n",
" 9068 | \n",
" Last decade of Ṭevet 4898 | \n",
" Anno Mundi | \n",
" 1138-01 | \n",
"
\n",
" \n",
- " | 8638 | \n",
- " 11215 | \n",
- " Middle third of Av 4889 | \n",
+ " 7049 | \n",
+ " 9120 | \n",
+ " Sunday, 5th of Kislev | \n",
" Anno Mundi | \n",
- " 1129-07-29/1129-08-07 | \n",
+ " 1140-11-17 | \n",
"
\n",
" \n",
"\n",
@@ -884,43 +928,47 @@
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "635 1154 Last decade of Kislev 5004 Anno Mundi \n",
- "1172 1750 11th Tammuz 4767 Anno Mundi \n",
- "1173 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n",
- "1556 2163 first third of Tammuz 500[.] Anno Mundi \n",
- "5142 6795 last decade of Tishrei 4991 Anno Mundi \n",
- "5223 6892 last decade of Iyyar 4906 Anno Mundi \n",
- "5664 7409 last third of Ḥeshvan 4965 Anno Mundi \n",
- "5812 7581 middle third of Adar 4876 Anno Mundi \n",
- "7024 9068 Last decade of Ṭevet 4898 Anno Mundi \n",
- "8638 11215 Middle third of Av 4889 Anno Mundi \n",
+ "631 1154 Last decade of Kislev 5004 Anno Mundi \n",
+ "1168 1750 11th Tammuz 4767 Anno Mundi \n",
+ "1169 1751 Monday, 27th Ṭevet 4797 Anno Mundi \n",
+ "1551 2163 first third of Tammuz 500[.] Anno Mundi \n",
+ "5126 6795 last decade of Tishrei 4991 Anno Mundi \n",
+ "5207 6892 last decade of Iyyar 4906 Anno Mundi \n",
+ "5646 7409 last third of Ḥeshvan 4965 Anno Mundi \n",
+ "5794 7581 middle third of Adar 4876 Anno Mundi \n",
+ "7003 9068 Last decade of Ṭevet 4898 Anno Mundi \n",
+ "7049 9120 Sunday, 5th of Kislev Anno Mundi \n",
"\n",
" doc_date_standard \n",
- "635 1243-12 \n",
- "1172 1007 \n",
- "1173 1037-01-23 \n",
- "1556 1244/1249 \n",
- "5142 1230-09-29/1230-10-08 \n",
- "5223 1146-05-04/1146-05-13 \n",
- "5664 1204-10-17/1204-10-25 \n",
- "5812 1116-05 \n",
- "7024 1138-01 \n",
- "8638 1129-07-29/1129-08-07 "
+ "631 1243-12 \n",
+ "1168 1007 \n",
+ "1169 1037-01-23 \n",
+ "1551 1244/1249 \n",
+ "5126 1230-09-29/1230-10-08 \n",
+ "5207 1146-05-04/1146-05-13 \n",
+ "5646 1204-10-17/1204-10-25 \n",
+ "5794 1116-05 \n",
+ "7003 1138-01 \n",
+ "7049 1140-11-17 "
]
},
- "execution_count": 13,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# how many use ordinals instead of numerals?\n",
- "hebrew_dates[hebrew_dates.doc_date_original.str.contains(\"st\") | hebrew_dates.doc_date_original.str.contains(\"rd\") | hebrew_dates.doc_date_original.str.contains(\"th\")][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard']].head(10)"
+ "hebrew_dates[\n",
+ " hebrew_dates.doc_date_original.str.contains(\"st\")\n",
+ " | hebrew_dates.doc_date_original.str.contains(\"rd\")\n",
+ " | hebrew_dates.doc_date_original.str.contains(\"th\")\n",
+ "][[\"pgpid\", \"doc_date_original\", \"doc_date_calendar\", \"doc_date_standard\"]].head(10)"
]
},
{
"cell_type": "code",
- "execution_count": 14,
+ "execution_count": 15,
"id": "5b6d5811-fe81-471d-bd29-896cec4c98ff",
"metadata": {},
"outputs": [
@@ -937,12 +985,14 @@
"source": [
"import re\n",
"\n",
+ "\n",
"def remove_ordinals(val):\n",
- " return re.sub(r'(\\d+)(st|nd|rd|th)', \"\\\\1\", val)\n",
+ " return re.sub(r\"(\\d+)(st|nd|rd|th)\", \"\\\\1\", val)\n",
+ "\n",
"\n",
"# test removing ordinals without removing the numbers\n",
- "for val in ['11th Tammuz 4767', \"27th Tevet\", \"8th Kislev\"]:\n",
- " print(f\"{val}: { remove_ordinals(val)}\")"
+ "for val in [\"11th Tammuz 4767\", \"27th Tevet\", \"8th Kislev\"]:\n",
+ " print(f\"{val}: {remove_ordinals(val)}\")"
]
},
{
@@ -956,7 +1006,7 @@
},
{
"cell_type": "code",
- "execution_count": 15,
+ "execution_count": 16,
"id": "798da8f2-2332-48c2-aeec-214474e9d49c",
"metadata": {},
"outputs": [],
@@ -966,7 +1016,7 @@
"from lark.exceptions import UnexpectedEOF\n",
"\n",
"# set this to True to see details about parsing\n",
- "VERBOSE_PARSE_OUTPUT = False \n",
+ "VERBOSE_PARSE_OUTPUT = False\n",
"\n",
"\n",
"def parse_original_date(row):\n",
@@ -980,27 +1030,40 @@
" # handle seleucid as hebrew with offset (adapt from pgp code)\n",
" undate_calendar = \"Seleucid\"\n",
"\n",
- " \n",
" if undate_calendar:\n",
" value = row.doc_date_original\n",
"\n",
" # some dates have unknown digits, e.g. 1[.] Kislev 48[..] or 152[.]\n",
" # ... the calendar parser don't support this, even though Undate does support unknown digits\n",
" # in future, perhaps we can add missing digit logic with this syntax to share across appropriate parsers\n",
- " if '[.' in value:\n",
+ " if \"[.\" in value:\n",
" if VERBOSE_PARSE_OUTPUT:\n",
" print(f\"ignoring missing digits for now {value}\")\n",
- " value = value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\") \n",
- " \n",
+ " value = (\n",
+ " value.replace(\"[.]\", \"0\").replace(\"[..]\", \"00\").replace(\"[...]\", \"000\")\n",
+ " )\n",
+ "\n",
" # some dates have inferred numbers, e.g. Friday, [25] Nisan [4810] or 8 Elul (4)811'\n",
- " # for now, just strip out brackets before parsing; \n",
+ " # for now, just strip out brackets before parsing;\n",
" # in future, could potentially infer uncertainty based on these\n",
- " value = value.replace('[', '').replace(']', '').replace('(', '').replace(')', '')\n",
+ " value = (\n",
+ " value.replace(\"[\", \"\").replace(\"]\", \"\").replace(\"(\", \"\").replace(\")\", \"\")\n",
+ " )\n",
"\n",
" # for now, remove modifiers that are not supported by undate parser:\n",
" # Late Tevet 4903, Last decade of Kislev 5004, first third of ...\n",
" # some dates include of, e.g. day of month\n",
- " modifiers = [\"Late \", \"(first|middle|last)( third|half|decade|tenth)? (of )?\", \"(Beginning|end) of \", \"last day\", \"First 10 days\", \" of\", \"spring\", \"decade \", \"night, \"]\n",
+ " modifiers = [\n",
+ " \"Late \",\n",
+ " \"(first|middle|last)( third|half|decade|tenth)? (of )?\",\n",
+ " \"(Beginning|end) of \",\n",
+ " \"last day\",\n",
+ " \"First 10 days\",\n",
+ " \" of\",\n",
+ " \"spring\",\n",
+ " \"decade \",\n",
+ " \"night, \",\n",
+ " ]\n",
" for mod in modifiers:\n",
" value = re.sub(mod, \"\", value, flags=re.I)\n",
"\n",
@@ -1017,12 +1080,14 @@
"\n",
" # about 62 have ordinals; strip them out\n",
" value = remove_ordinals(value)\n",
- " \n",
+ "\n",
" try:\n",
" return Undate.parse(value, undate_calendar)\n",
" except (VisitError, ValueError, UnexpectedEOF) as err:\n",
" if VERBOSE_PARSE_OUTPUT:\n",
- " print(f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\")\n",
+ " print(\n",
+ " f\"Parse error on PGPID {row.pgpid} {value} ({undate_calendar}): {err}\"\n",
+ " )\n",
"\n",
" # there are a handful of cases in PGP where calendars are mixed,\n",
" # i.e. hebrew months used for hijri calendar\n",
@@ -1034,13 +1099,16 @@
" if parsed:\n",
" parsed = parsed.as_calendar(undate_calendar)\n",
" if VERBOSE_PARSE_OUTPUT:\n",
- " print(f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\")\n",
+ " print(\n",
+ " f\"parsed {value} with ISO8601 format and calendar {undate_calendar}, result is {parsed} ({parsed.earliest}/{parsed.latest})\"\n",
+ " )\n",
" return parsed\n",
" except ValueError as err:\n",
" if VERBOSE_PARSE_OUTPUT:\n",
" print(f\"Could not parse {value} as ISO date: {err}\")\n",
"\n",
- "docs_with_docdate['undate_orig'] = docs_with_docdate.apply(parse_original_date, axis=1)"
+ "\n",
+ "docs_with_docdate[\"undate_orig\"] = docs_with_docdate.apply(parse_original_date, axis=1)"
]
},
{
@@ -1055,7 +1123,7 @@
},
{
"cell_type": "code",
- "execution_count": 16,
+ "execution_count": 17,
"id": "623eb160-ab6c-44ba-b3f4-6770c2c7bd86",
"metadata": {},
"outputs": [
@@ -1063,21 +1131,25 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "original dates parsed: 3462\n",
- "original dates unparsed: 173 (anno mundi, hijri, and seleucid calendars)\n",
- "proportion parsed: 95.24%\n"
+ "original dates parsed: 4058\n",
+ "original dates unparsed: 198 (anno mundi, hijri, and seleucid calendars)\n",
+ "proportion parsed: 95.35%\n"
]
}
],
"source": [
"orig_dates_parsed = docs_with_docdate[docs_with_docdate.undate_orig.notna()].copy()\n",
- "orig_dates_unparsed = docs_with_docdate[docs_with_docdate.doc_date_original.notna() & docs_with_docdate.doc_date_calendar.isin(['Anno Mundi', 'Hijrī', 'Seleucid']) & docs_with_docdate.undate_orig.isna()] \n",
+ "orig_dates_unparsed = docs_with_docdate[\n",
+ " docs_with_docdate.doc_date_original.notna()\n",
+ " & docs_with_docdate.doc_date_calendar.isin([\"Anno Mundi\", \"Hijrī\", \"Seleucid\"])\n",
+ " & docs_with_docdate.undate_orig.isna()\n",
+ "]\n",
"\n",
"total_parsed = len(orig_dates_parsed)\n",
"total_unparsed = len(orig_dates_unparsed)\n",
"print(f\"\"\"original dates parsed: {total_parsed}\n",
"original dates unparsed: {total_unparsed} (anno mundi, hijri, and seleucid calendars)\n",
- "proportion parsed: {(total_parsed/(total_parsed + total_unparsed))*100:0.2f}%\"\"\")"
+ "proportion parsed: {(total_parsed / (total_parsed + total_unparsed)) * 100:0.2f}%\"\"\")"
]
},
{
@@ -1092,7 +1164,7 @@
},
{
"cell_type": "code",
- "execution_count": 17,
+ "execution_count": 18,
"id": "42945787-6788-422d-9a04-f983ec6b31af",
"metadata": {},
"outputs": [
@@ -1132,8 +1204,8 @@
" 449 | \n",
" 1570 | \n",
" Seleucid | \n",
- " 1259 | \n",
- " 1259 | \n",
+ " 1258-08-31/1259-09-19 | \n",
+ " 1258-08-31/1259-09-19 | \n",
" 1570 | \n",
" year | \n",
" \n",
@@ -1183,34 +1255,46 @@
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
- "5 449 1570 Seleucid 1259 \n",
+ "5 449 1570 Seleucid 1258-08-31/1259-09-19 \n",
"16 463 19 Adar 1427 Seleucid 1116-03-05 \n",
"17 464 Tammuz 1288 Seleucid 0977-06-21/0977-07-19 \n",
"23 472 1337 Seleucid 1025-08-28/1026-09-14 \n",
"41 499 Wednesday, 15 Kislev 1500 Seleucid 1188-12-07 \n",
"\n",
" undate_standard undate_orig orig_date_precision \n",
- "5 1259 1570 year \n",
+ "5 1258-08-31/1259-09-19 1570 year \n",
"16 1116-03-05 1427-12-19 day \n",
"17 0977-06-21/0977-07-19 1288-04 month \n",
"23 1025-08-28/1026-09-14 1337 year \n",
"41 1188-12-07 1500-09-15 day "
]
},
- "execution_count": 17,
+ "execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# determine original date precision based on parsed undate\n",
- "orig_dates_parsed['orig_date_precision'] = orig_dates_parsed.undate_orig.apply(lambda x: str(x.precision).lower())\n",
- "orig_dates_parsed[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision']].head()"
+ "orig_dates_parsed[\"orig_date_precision\"] = orig_dates_parsed.undate_orig.apply(\n",
+ " lambda x: str(x.precision).lower()\n",
+ ")\n",
+ "orig_dates_parsed[\n",
+ " [\n",
+ " \"pgpid\",\n",
+ " \"doc_date_original\",\n",
+ " \"doc_date_calendar\",\n",
+ " \"doc_date_standard\",\n",
+ " \"undate_standard\",\n",
+ " \"undate_orig\",\n",
+ " \"orig_date_precision\",\n",
+ " ]\n",
+ "].head()"
]
},
{
"cell_type": "code",
- "execution_count": 18,
+ "execution_count": 19,
"id": "88f1d3ab-e1c7-48b5-8907-5aeea463f1e8",
"metadata": {},
"outputs": [
@@ -1218,13 +1302,13 @@
"data": {
"text/plain": [
"orig_date_precision\n",
- "day 1599\n",
- "month 1027\n",
- "year 836\n",
+ "day 1947\n",
+ "month 1178\n",
+ "year 933\n",
"Name: count, dtype: int64"
]
},
- "execution_count": 18,
+ "execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
@@ -1246,7 +1330,7 @@
},
{
"cell_type": "code",
- "execution_count": 19,
+ "execution_count": 20,
"id": "5d3a55b0-ed36-47ba-b022-848bb128b449",
"metadata": {},
"outputs": [
@@ -1289,7 +1373,7 @@
" Seleucid | \n",
" 1570 | \n",
" year | \n",
- " 1259 | \n",
+ " 1258-08-31/1259-09-19 | \n",
" 1258-09-07 | \n",
" 1259-09-26 | \n",
" \n",
@@ -1371,27 +1455,27 @@
" 1130-11-10 | \n",
" \n",
" \n",
+ " | 56 | \n",
+ " 517 | \n",
+ " Elul 1351 | \n",
+ " Seleucid | \n",
+ " 1351-06 | \n",
+ " month | \n",
+ " 1040-08-13/1040-09-10 | \n",
+ " 1040-08-19 | \n",
+ " 1040-09-16 | \n",
+ "
\n",
+ " \n",
" | 73 | \n",
" 537 | \n",
" Ḥeshvan 1453 | \n",
" Seleucid | \n",
" 1453-08 | \n",
" month | \n",
- " 1141 | \n",
+ " 1141-10-04/1141-11-01 | \n",
" 1141-10-11 | \n",
" 1141-11-08 | \n",
"
\n",
- " \n",
- " | 75 | \n",
- " 544 | \n",
- " Sunday, 21 Kislev 1355 | \n",
- " Seleucid | \n",
- " 1355-09-21 | \n",
- " day | \n",
- " 1043-11-26 | \n",
- " 1043-12-02 | \n",
- " 1043-12-02 | \n",
- "
\n",
" \n",
"\n",
""
@@ -1406,11 +1490,11 @@
"43 502 Tevet 1548 Seleucid 1548-10 \n",
"47 506 Elul 1428 Seleucid 1428-06 \n",
"55 516 First decade of Ḥeshvan 1442 Seleucid 1442-08 \n",
+ "56 517 Elul 1351 Seleucid 1351-06 \n",
"73 537 Ḥeshvan 1453 Seleucid 1453-08 \n",
- "75 544 Sunday, 21 Kislev 1355 Seleucid 1355-09-21 \n",
"\n",
" orig_date_precision doc_date_standard undate_earliest undate_latest \n",
- "5 year 1259 1258-09-07 1259-09-26 \n",
+ "5 year 1258-08-31/1259-09-19 1258-09-07 1259-09-26 \n",
"16 day 1116-03-05 1116-03-12 1116-03-12 \n",
"17 month 0977-06-21/0977-07-19 0977-06-26 0977-07-24 \n",
"23 year 1025-08-28/1026-09-14 1025-09-03 1026-09-20 \n",
@@ -1418,35 +1502,49 @@
"43 month 1236-11-30/1236-12-28 1236-12-07 1237-01-04 \n",
"47 month 1117-08-01/1117-08-29 1117-08-08 1117-09-05 \n",
"55 month 1130-10-06/1130-10-15 1130-10-13 1130-11-10 \n",
- "73 month 1141 1141-10-11 1141-11-08 \n",
- "75 day 1043-11-26 1043-12-02 1043-12-02 "
+ "56 month 1040-08-13/1040-09-10 1040-08-19 1040-09-16 \n",
+ "73 month 1141-10-04/1141-11-01 1141-10-11 1141-11-08 "
]
},
- "execution_count": 19,
+ "execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "seleucid_dates = orig_dates_parsed[orig_dates_parsed.doc_date_calendar == 'Seleucid'].copy()\n",
- "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date \n",
- "seleucid_dates['undate_earliest'] = seleucid_dates.undate_orig.apply(lambda x: x.earliest)\n",
- "seleucid_dates['undate_latest'] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n",
+ "seleucid_dates = orig_dates_parsed[\n",
+ " orig_dates_parsed.doc_date_calendar == \"Seleucid\"\n",
+ "].copy()\n",
+ "# add undate earliest/latest (Gregorian) for comparison with dataset standardized date\n",
+ "seleucid_dates[\"undate_earliest\"] = seleucid_dates.undate_orig.apply(\n",
+ " lambda x: x.earliest\n",
+ ")\n",
+ "seleucid_dates[\"undate_latest\"] = seleucid_dates.undate_orig.apply(lambda x: x.latest)\n",
"\n",
- "seleucid_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'undate_orig', 'orig_date_precision', 'doc_date_standard', 'undate_earliest', 'undate_latest']].head(10)\n",
- " "
+ "seleucid_dates[\n",
+ " [\n",
+ " \"pgpid\",\n",
+ " \"doc_date_original\",\n",
+ " \"doc_date_calendar\",\n",
+ " \"undate_orig\",\n",
+ " \"orig_date_precision\",\n",
+ " \"doc_date_standard\",\n",
+ " \"undate_earliest\",\n",
+ " \"undate_latest\",\n",
+ " ]\n",
+ "].head(10)"
]
},
{
"cell_type": "code",
- "execution_count": 20,
+ "execution_count": 21,
"id": "a104d772-6c2c-4711-91ec-8cf1f108ae23",
"metadata": {},
"outputs": [],
"source": [
- "# can we sort by parsed original dates? \n",
+ "# can we sort by parsed original dates?\n",
"# doesn't work currently because of overlapping dates / different granularity\n",
- "#orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)"
+ "# orig_dates_parsed.sort_values(by='undate_orig') #, key=lambda col: col.value.earliest)"
]
},
{
@@ -1463,7 +1561,7 @@
},
{
"cell_type": "code",
- "execution_count": 21,
+ "execution_count": 22,
"id": "c653d928-8fec-4ddc-9abf-ace2f7ca6629",
"metadata": {},
"outputs": [],
@@ -1472,14 +1570,20 @@
"\n",
"# NOTE: we have to cast type to something pandas/altair supports\n",
"\n",
- "orig_dates_parsed['orig_date_earliest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest).astype('datetime64[s]')\n",
- "orig_dates_parsed['orig_date_latest'] = orig_dates_parsed.undate_orig.apply(lambda x: x.latest).astype('datetime64[s]')\n",
- "orig_dates_parsed['orig_date_mid'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest + (x.latest - x.earliest)/2).astype('datetime64[s]')"
+ "orig_dates_parsed[\"orig_date_earliest\"] = orig_dates_parsed.undate_orig.apply(\n",
+ " lambda x: x.earliest\n",
+ ").astype(\"datetime64[s]\")\n",
+ "orig_dates_parsed[\"orig_date_latest\"] = orig_dates_parsed.undate_orig.apply(\n",
+ " lambda x: x.latest\n",
+ ").astype(\"datetime64[s]\")\n",
+ "orig_dates_parsed[\"orig_date_mid\"] = orig_dates_parsed.undate_orig.apply(\n",
+ " lambda x: x.earliest + (x.latest - x.earliest) / 2\n",
+ ").astype(\"datetime64[s]\")"
]
},
{
"cell_type": "code",
- "execution_count": 22,
+ "execution_count": 23,
"id": "91f155fe-d0e6-4ee4-99de-698ac301e3f3",
"metadata": {},
"outputs": [
@@ -1577,6 +1681,14 @@
" Seleucid | \n",
" \n",
" \n",
+ " | 56 | \n",
+ " 1040-08-19 | \n",
+ " 1040-09-16 | \n",
+ " 1040-09-02 | \n",
+ " 517 | \n",
+ " Seleucid | \n",
+ "
\n",
+ " \n",
" | 61 | \n",
" 1035-05-28 | \n",
" 1035-05-28 | \n",
@@ -1584,14 +1696,6 @@
" 524 | \n",
" Anno Mundi | \n",
"
\n",
- " \n",
- " | 62 | \n",
- " 1034-08-25 | \n",
- " 1034-09-22 | \n",
- " 1034-09-08 | \n",
- " 525 | \n",
- " Hijrī | \n",
- "
\n",
" \n",
"\n",
""
@@ -1606,22 +1710,30 @@
"43 1236-12-07 1237-01-04 1236-12-21 502 Seleucid\n",
"47 1117-08-08 1117-09-05 1117-08-22 506 Seleucid\n",
"55 1130-10-13 1130-11-10 1130-10-27 516 Seleucid\n",
- "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi\n",
- "62 1034-08-25 1034-09-22 1034-09-08 525 Hijrī"
+ "56 1040-08-19 1040-09-16 1040-09-02 517 Seleucid\n",
+ "61 1035-05-28 1035-05-28 1035-05-28 524 Anno Mundi"
]
},
- "execution_count": 22,
+ "execution_count": 23,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].head(10)"
+ "orig_dates_parsed[\n",
+ " [\n",
+ " \"orig_date_earliest\",\n",
+ " \"orig_date_latest\",\n",
+ " \"orig_date_mid\",\n",
+ " \"pgpid\",\n",
+ " \"doc_date_calendar\",\n",
+ " ]\n",
+ "].head(10)"
]
},
{
"cell_type": "code",
- "execution_count": 23,
+ "execution_count": 24,
"id": "144b2a4a-81cf-4a6d-a277-3a7910354a77",
"metadata": {},
"outputs": [
@@ -1630,23 +1742,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.Chart(...)"
]
},
- "execution_count": 23,
+ "execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
@@ -1712,14 +1824,24 @@
"\n",
"date_docs_cal = orig_dates_parsed[orig_dates_parsed.doc_date_standard.notna()]\n",
"\n",
- "dated_docs_cal = date_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n",
- "dated_docs_cal['midpoint_year'] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n",
+ "dated_docs_cal = date_docs_cal.fillna({\"doc_date_calendar\": \"Unspecified\"})\n",
+ "dated_docs_cal[\"midpoint_year\"] = dated_docs_cal.orig_date_mid.apply(lambda x: x.year)\n",
"\n",
- "orig_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n",
- " x=alt.X('midpoint_year', title=\"Year (midpoint)\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n",
- " y=alt.Y('count(pgpid)', title='Documents'),\n",
- " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\")\n",
- ").properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n",
+ "orig_dates_calendars_chart = (\n",
+ " alt.Chart(dated_docs_cal[[\"pgpid\", \"midpoint_year\", \"doc_date_calendar\"]])\n",
+ " .mark_area(opacity=0.7)\n",
+ " .encode(\n",
+ " x=alt.X(\n",
+ " \"midpoint_year\",\n",
+ " title=\"Year (midpoint)\",\n",
+ " bin=alt.Bin(maxbins=120),\n",
+ " axis=alt.Axis(format=\"r\"),\n",
+ " ),\n",
+ " y=alt.Y(\"count(pgpid)\", title=\"Documents\"),\n",
+ " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\"),\n",
+ " )\n",
+ " .properties(width=900, height=200, title=\"Documents by calendar (original date)\")\n",
+ ")\n",
"\n",
"orig_dates_calendars_chart"
]
@@ -1734,7 +1856,7 @@
},
{
"cell_type": "code",
- "execution_count": 24,
+ "execution_count": 25,
"id": "4acc9a2b-d403-4f93-b2c5-6fee92ead105",
"metadata": {},
"outputs": [
@@ -1743,23 +1865,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.Chart(...)"
]
},
- "execution_count": 24,
+ "execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
@@ -1822,29 +1944,46 @@
"source": [
"# graph documents with calendars\n",
"\n",
+ "\n",
"def undate_midpoint(value):\n",
" # parsed standard date could be an undate or an interval; handle either\n",
" if isinstance(value, Undate):\n",
" earliest = value.earliest\n",
" latest = value.latest\n",
- " else: # interval\n",
+ " else: # interval\n",
" earliest = value.earliest.earliest\n",
" latest = value.latest.latest\n",
- " return earliest + (latest - earliest)/2\n",
- " \n",
+ " return earliest + (latest - earliest) / 2\n",
+ "\n",
"\n",
"dated_docs_cal = docs_with_docdate.copy()\n",
- "dated_docs_cal = dated_docs_cal.fillna({'doc_date_calendar': 'Unspecified'})\n",
+ "dated_docs_cal = dated_docs_cal.fillna({\"doc_date_calendar\": \"Unspecified\"})\n",
"# get the midpoint from the parsed standard date; convert to supported type\n",
- "dated_docs_cal['midpoint'] = dated_docs_cal.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n",
- "dated_docs_cal['midpoint_year'] = dated_docs_cal.midpoint.apply(lambda x: x.year if pd.notna(x) else None)\n",
+ "dated_docs_cal[\"midpoint\"] = dated_docs_cal.undate_standard.apply(\n",
+ " lambda x: undate_midpoint(x) if pd.notna(x) else None\n",
+ ").astype(\"datetime64[s]\")\n",
+ "dated_docs_cal[\"midpoint_year\"] = dated_docs_cal.midpoint.apply(\n",
+ " lambda x: x.year if pd.notna(x) else None\n",
+ ")\n",
"\n",
"\n",
- "std_dates_calendars_chart = alt.Chart(dated_docs_cal[['pgpid', 'midpoint_year', 'doc_date_calendar']]).mark_area(opacity=0.7).encode(\n",
- " x=alt.X('midpoint_year', title=\"Year\", bin=alt.Bin(maxbins=120), axis=alt.Axis(format=\"r\")),\n",
- " y=alt.Y('count(pgpid)', title='Documents'),\n",
- " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(domain=['Anno Mundi', 'Hijrī', 'Seleucid', 'Kharājī', 'Unspecified'])\n",
- ").properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n",
+ "std_dates_calendars_chart = (\n",
+ " alt.Chart(dated_docs_cal[[\"pgpid\", \"midpoint_year\", \"doc_date_calendar\"]])\n",
+ " .mark_area(opacity=0.7)\n",
+ " .encode(\n",
+ " x=alt.X(\n",
+ " \"midpoint_year\",\n",
+ " title=\"Year\",\n",
+ " bin=alt.Bin(maxbins=120),\n",
+ " axis=alt.Axis(format=\"r\"),\n",
+ " ),\n",
+ " y=alt.Y(\"count(pgpid)\", title=\"Documents\"),\n",
+ " color=alt.Y(\"doc_date_calendar\", title=\"Calendar\").scale(\n",
+ " domain=[\"Anno Mundi\", \"Hijrī\", \"Seleucid\", \"Kharājī\", \"Unspecified\"]\n",
+ " ),\n",
+ " )\n",
+ " .properties(width=900, height=200, title=\"Documents by calendar (standard date)\")\n",
+ ")\n",
"\n",
"std_dates_calendars_chart"
]
@@ -1859,7 +1998,7 @@
},
{
"cell_type": "code",
- "execution_count": 25,
+ "execution_count": 26,
"id": "4d7c4d5f-636c-42a0-a906-21c67f5781b8",
"metadata": {},
"outputs": [
@@ -1868,23 +2007,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.VConcatChart(...)"
]
},
- "execution_count": 25,
+ "execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
@@ -1958,7 +2097,7 @@
},
{
"cell_type": "code",
- "execution_count": 26,
+ "execution_count": 27,
"id": "c5861110-dbd5-4d7a-8ada-acf7cb871aa7",
"metadata": {},
"outputs": [
@@ -1967,23 +2106,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.VConcatChart(...)"
]
},
- "execution_count": 26,
+ "execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "graphable_data = orig_dates_parsed[['orig_date_earliest', 'orig_date_latest', 'orig_date_mid', 'pgpid', 'doc_date_calendar']].copy()\n",
+ "graphable_data = orig_dates_parsed[\n",
+ " [\n",
+ " \"orig_date_earliest\",\n",
+ " \"orig_date_latest\",\n",
+ " \"orig_date_mid\",\n",
+ " \"pgpid\",\n",
+ " \"doc_date_calendar\",\n",
+ " ]\n",
+ "].copy()\n",
"# graphable_data['midpoint'] = graphable_data.undate_standard.apply(lambda x: undate_midpoint(x) if pd.notna(x) else None).astype(\"datetime64[s]\")\n",
- "graphable_data['midpoint_year'] = graphable_data.orig_date_mid.apply(lambda x: x.year if pd.notna(x) else None)\n",
+ "graphable_data[\"midpoint_year\"] = graphable_data.orig_date_mid.apply(\n",
+ " lambda x: x.year if pd.notna(x) else None\n",
+ ")\n",
"\n",
"\n",
- "bar_chart = alt.Chart(graphable_data).mark_bar(opacity=0.5).encode(\n",
- " x=alt.X('orig_date_earliest:T', title=\"original date (range)\"), # , axis=alt.Axis(format=\"r\")),\n",
- " x2='orig_date_latest:T',\n",
- " y=alt.Y('count(pgpid)', title='Count of Documents')\n",
- ").properties(width=1200, height=150)\n",
+ "bar_chart = (\n",
+ " alt.Chart(graphable_data)\n",
+ " .mark_bar(opacity=0.5)\n",
+ " .encode(\n",
+ " x=alt.X(\n",
+ " \"orig_date_earliest:T\", title=\"original date (range)\"\n",
+ " ), # , axis=alt.Axis(format=\"r\")),\n",
+ " x2=\"orig_date_latest:T\",\n",
+ " y=alt.Y(\"count(pgpid)\", title=\"Count of Documents\"),\n",
+ " )\n",
+ " .properties(width=1200, height=150)\n",
+ ")\n",
"\n",
- "line_chart = alt.Chart(graphable_data).mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\").encode(\n",
- " x=alt.X('orig_date_mid:T', title=\"Year (midpoint)\"),\n",
- " y=alt.Y('count(pgpid)', title='Documents')\n",
- ").properties(width=1200, height=150)\n",
+ "line_chart = (\n",
+ " alt.Chart(graphable_data)\n",
+ " .mark_line(opacity=0.6, color=\"green\", interpolate=\"monotone\")\n",
+ " .encode(\n",
+ " x=alt.X(\"orig_date_mid:T\", title=\"Year (midpoint)\"),\n",
+ " y=alt.Y(\"count(pgpid)\", title=\"Documents\"),\n",
+ " )\n",
+ " .properties(width=1200, height=150)\n",
+ ")\n",
"\n",
"(bar_chart & line_chart).properties(title=\"Documents by date (1000-1300)\").interactive()"
]
@@ -2075,7 +2236,7 @@
},
{
"cell_type": "code",
- "execution_count": 27,
+ "execution_count": 28,
"id": "3122a874-bb17-429f-993f-4bf7a76c1a36",
"metadata": {},
"outputs": [
@@ -2112,7 +2273,7 @@
" \n",
" \n",
" \n",
- " | 851 | \n",
+ " 847 | \n",
" 1377 | \n",
" Wednesday night, 28 Sivan 1581 | \n",
" Seleucid | \n",
@@ -2123,7 +2284,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 1714 | \n",
+ " 1709 | \n",
" 2418 | \n",
" Monday 20 Tevet 1520 | \n",
" Seleucid | \n",
@@ -2134,7 +2295,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 1929 | \n",
+ " 1923 | \n",
" 2649 | \n",
" Sunday night, 25 Kislev 1444 | \n",
" Seleucid | \n",
@@ -2145,7 +2306,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 2013 | \n",
+ " 2007 | \n",
" 2739 | \n",
" Wednesday 29th Elul 1354 | \n",
" Seleucid | \n",
@@ -2156,7 +2317,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 3257 | \n",
+ " 3248 | \n",
" 4026 | \n",
" Wednesday night, 29 Tishrei 1541 | \n",
" Seleucid | \n",
@@ -2178,7 +2339,7 @@
" ... | \n",
"
\n",
" \n",
- " | 29303 | \n",
+ " 29175 | \n",
" 34623 | \n",
" Sunday night, 20 Ṭevet 1578 | \n",
" Seleucid | \n",
@@ -2189,7 +2350,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 29924 | \n",
+ " 29792 | \n",
" 35264 | \n",
" Wednesday 13 Ṭevet 1526 | \n",
" Seleucid | \n",
@@ -2200,7 +2361,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 34008 | \n",
+ " 33867 | \n",
" 39564 | \n",
" Monday 16 Tevet 1339 | \n",
" Seleucid | \n",
@@ -2211,7 +2372,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 34466 | \n",
+ " 34322 | \n",
" 40035 | \n",
" Monday 1st Iyyar 1437 | \n",
" Seleucid | \n",
@@ -2222,7 +2383,7 @@
" Legal document | \n",
"
\n",
" \n",
- " | 34467 | \n",
+ " 34323 | \n",
" 40036 | \n",
" Friday 15 of Adar 1443 | \n",
" Seleucid | \n",
@@ -2234,59 +2395,72 @@
"
\n",
" \n",
"\n",
- "104 rows × 8 columns
\n",
+ "106 rows × 8 columns
\n",
""
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
- "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
- "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
- "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
- "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "847 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1709 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1923 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2007 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3248 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
"... ... ... ... \n",
- "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
- "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
- "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
- "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
- "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "29175 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29792 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "33867 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34322 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34323 40036 Friday 15 of Adar 1443 Seleucid \n",
"\n",
" doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
- "851 1270 1270 1581-03-28 day \n",
- "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
- "1929 1133 1133 1444-09-25 day \n",
- "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
- "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "847 1270 1270 1581-03-28 day \n",
+ "1709 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1923 1133 1133 1444-09-25 day \n",
+ "2007 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3248 1229-09-18 1229-09-18 1541-07-29 day \n",
"... ... ... ... ... \n",
- "29303 1266/1267 1266/1267 1578-10-20 day \n",
- "29924 1214/1215 1214/1215 1526-10-13 day \n",
- "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
- "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
- "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "29175 1266/1267 1266/1267 1578-10-20 day \n",
+ "29792 1214/1215 1214/1215 1526-10-13 day \n",
+ "33867 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34322 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34323 1132-03-04 1132-03-04 1443-12-15 day \n",
"\n",
" type \n",
- "851 Legal document \n",
- "1714 Legal document \n",
- "1929 Legal document \n",
- "2013 Legal document \n",
- "3257 Legal document \n",
+ "847 Legal document \n",
+ "1709 Legal document \n",
+ "1923 Legal document \n",
+ "2007 Legal document \n",
+ "3248 Legal document \n",
"... ... \n",
- "29303 Legal document \n",
- "29924 Legal document \n",
- "34008 Legal document \n",
- "34466 Legal document \n",
- "34467 Legal document \n",
+ "29175 Legal document \n",
+ "29792 Legal document \n",
+ "33867 Legal document \n",
+ "34322 Legal document \n",
+ "34323 Legal document \n",
"\n",
- "[104 rows x 8 columns]"
+ "[106 rows x 8 columns]"
]
},
- "execution_count": 27,
+ "execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "weekday_dates = orig_dates_parsed[orig_dates_parsed.doc_date_original.str.contains('day ')][['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'orig_date_precision', 'type']]\n",
+ "weekday_dates = orig_dates_parsed[\n",
+ " orig_dates_parsed.doc_date_original.str.contains(\"day \")\n",
+ "][\n",
+ " [\n",
+ " \"pgpid\",\n",
+ " \"doc_date_original\",\n",
+ " \"doc_date_calendar\",\n",
+ " \"doc_date_standard\",\n",
+ " \"undate_standard\",\n",
+ " \"undate_orig\",\n",
+ " \"orig_date_precision\",\n",
+ " \"type\",\n",
+ " ]\n",
+ "]\n",
"weekday_dates"
]
},
@@ -2302,7 +2476,7 @@
},
{
"cell_type": "code",
- "execution_count": 28,
+ "execution_count": 29,
"id": "3e4ea50c-b11c-433b-b6f9-691098b057d3",
"metadata": {},
"outputs": [
@@ -2342,7 +2516,7 @@
" \n",
" \n",
" \n",
- " | 851 | \n",
+ " 847 | \n",
" 1377 | \n",
" Wednesday night, 28 Sivan 1581 | \n",
" Seleucid | \n",
@@ -2356,7 +2530,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 1714 | \n",
+ " 1709 | \n",
" 2418 | \n",
" Monday 20 Tevet 1520 | \n",
" Seleucid | \n",
@@ -2370,7 +2544,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 1929 | \n",
+ " 1923 | \n",
" 2649 | \n",
" Sunday night, 25 Kislev 1444 | \n",
" Seleucid | \n",
@@ -2384,7 +2558,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 2013 | \n",
+ " 2007 | \n",
" 2739 | \n",
" Wednesday 29th Elul 1354 | \n",
" Seleucid | \n",
@@ -2398,7 +2572,7 @@
" Wednesday | \n",
"
\n",
" \n",
- " | 3257 | \n",
+ " 3248 | \n",
" 4026 | \n",
" Wednesday night, 29 Tishrei 1541 | \n",
" Seleucid | \n",
@@ -2426,7 +2600,7 @@
" ... | \n",
"
\n",
" \n",
- " | 29303 | \n",
+ " 29175 | \n",
" 34623 | \n",
" Sunday night, 20 Ṭevet 1578 | \n",
" Seleucid | \n",
@@ -2440,7 +2614,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 29924 | \n",
+ " 29792 | \n",
" 35264 | \n",
" Wednesday 13 Ṭevet 1526 | \n",
" Seleucid | \n",
@@ -2454,7 +2628,7 @@
" Wednesday | \n",
"
\n",
" \n",
- " | 34008 | \n",
+ " 33867 | \n",
" 39564 | \n",
" Monday 16 Tevet 1339 | \n",
" Seleucid | \n",
@@ -2468,7 +2642,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 34466 | \n",
+ " 34322 | \n",
" 40035 | \n",
" Monday 1st Iyyar 1437 | \n",
" Seleucid | \n",
@@ -2482,7 +2656,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 34467 | \n",
+ " 34323 | \n",
" 40036 | \n",
" Friday 15 of Adar 1443 | \n",
" Seleucid | \n",
@@ -2497,53 +2671,53 @@
"
\n",
" \n",
"\n",
- "104 rows × 11 columns
\n",
+ "106 rows × 11 columns
\n",
""
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
- "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
- "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
- "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
- "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "847 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1709 2418 Monday 20 Tevet 1520 Seleucid \n",
+ "1923 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "2007 2739 Wednesday 29th Elul 1354 Seleucid \n",
+ "3248 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
"... ... ... ... \n",
- "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
- "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
- "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
- "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
- "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
+ "29175 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "29792 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
+ "33867 39564 Monday 16 Tevet 1339 Seleucid \n",
+ "34322 40035 Monday 1st Iyyar 1437 Seleucid \n",
+ "34323 40036 Friday 15 of Adar 1443 Seleucid \n",
"\n",
" doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
- "851 1270 1270 1581-03-28 day \n",
- "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
- "1929 1133 1133 1444-09-25 day \n",
- "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
- "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "847 1270 1270 1581-03-28 day \n",
+ "1709 1208-12-29 1208-12-29 1520-10-20 day \n",
+ "1923 1133 1133 1444-09-25 day \n",
+ "2007 1043-09-07 1043-09-07 1354-06-29 day \n",
+ "3248 1229-09-18 1229-09-18 1541-07-29 day \n",
"... ... ... ... ... \n",
- "29303 1266/1267 1266/1267 1578-10-20 day \n",
- "29924 1214/1215 1214/1215 1526-10-13 day \n",
- "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
- "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
- "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
+ "29175 1266/1267 1266/1267 1578-10-20 day \n",
+ "29792 1214/1215 1214/1215 1526-10-13 day \n",
+ "33867 1027-12-18 1027-12-18 1339-10-16 day \n",
+ "34322 1126-04-26 1126-04-26 1437-02-01 day \n",
+ "34323 1132-03-04 1132-03-04 1443-12-15 day \n",
"\n",
" type undate_weekday undate_weekday_name orig_weekday \n",
- "851 Legal document 3 Thursday Thursday \n",
- "1714 Legal document 0 Monday Monday \n",
- "1929 Legal document 0 Monday Monday \n",
- "2013 Legal document 2 Wednesday Wednesday \n",
- "3257 Legal document 3 Thursday Thursday \n",
+ "847 Legal document 3 Thursday Thursday \n",
+ "1709 Legal document 0 Monday Monday \n",
+ "1923 Legal document 0 Monday Monday \n",
+ "2007 Legal document 2 Wednesday Wednesday \n",
+ "3248 Legal document 3 Thursday Thursday \n",
"... ... ... ... ... \n",
- "29303 Legal document 0 Monday Monday \n",
- "29924 Legal document 2 Wednesday Wednesday \n",
- "34008 Legal document 0 Monday Monday \n",
- "34466 Legal document 0 Monday Monday \n",
- "34467 Legal document 4 Friday Friday \n",
+ "29175 Legal document 0 Monday Monday \n",
+ "29792 Legal document 2 Wednesday Wednesday \n",
+ "33867 Legal document 0 Monday Monday \n",
+ "34322 Legal document 0 Monday Monday \n",
+ "34323 Legal document 4 Friday Friday \n",
"\n",
- "[104 rows x 11 columns]"
+ "[106 rows x 11 columns]"
]
},
- "execution_count": 28,
+ "execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
@@ -2552,23 +2726,38 @@
"days = [\"Monday\", \"Tuesday\", \"Wednesday\", \"Thursday\", \"Friday\", \"Saturday\", \"Sunday\"]\n",
"\n",
"# get numeric weekday; since these dates are all day-precision we can just use the earliest date\n",
- "weekday_dates['undate_weekday'] = weekday_dates.undate_orig.apply(lambda x: x.earliest.weekday)\n",
- "weekday_dates['undate_weekday_name'] = weekday_dates.undate_weekday.apply(lambda x: days[x])\n",
+ "weekday_dates[\"undate_weekday\"] = weekday_dates.undate_orig.apply(\n",
+ " lambda x: x.earliest.weekday\n",
+ ")\n",
+ "weekday_dates[\"undate_weekday_name\"] = weekday_dates.undate_weekday.apply(\n",
+ " lambda x: days[x]\n",
+ ")\n",
"# extract weekday from date label\n",
- "weekday_dates['orig_weekday'] = weekday_dates.doc_date_original.str.extract('([a-zA-Z]+day)', expand=False).str.strip()\n",
+ "weekday_dates[\"orig_weekday\"] = weekday_dates.doc_date_original.str.extract(\n",
+ " \"([a-zA-Z]+day)\", expand=False\n",
+ ").str.strip()\n",
"# correct misspellings\n",
"misspelled_days = {\n",
" \"Wedensday\": \"Wednesday\",\n",
" \"Thrusday\": \"Thursday\",\n",
"}\n",
- "weekday_dates['orig_weekday'] = weekday_dates.orig_weekday.apply(lambda x: misspelled_days.get(x, x))\n",
+ "weekday_dates[\"orig_weekday\"] = weekday_dates.orig_weekday.apply(\n",
+ " lambda x: misspelled_days.get(x, x)\n",
+ ")\n",
+ "\n",
"\n",
"# shift night to next day, e.g. Wednesday night should be Thursday\n",
"# NOTE: this must be done immediately after the day extraction, otherwise repeated runs continue shifting to the next day\n",
"def next_day(weekday):\n",
- " return days[(days.index(weekday) +1) % 7]\n",
+ " return days[(days.index(weekday) + 1) % 7]\n",
"\n",
- "weekday_dates['orig_weekday'] = weekday_dates.apply(lambda row: next_day(row.orig_weekday) if \" night\" in row.doc_date_original else row.orig_weekday, axis=1)\n",
+ "\n",
+ "weekday_dates[\"orig_weekday\"] = weekday_dates.apply(\n",
+ " lambda row: next_day(row.orig_weekday)\n",
+ " if \" night\" in row.doc_date_original\n",
+ " else row.orig_weekday,\n",
+ " axis=1,\n",
+ ")\n",
"\n",
"weekday_dates"
]
@@ -2583,7 +2772,7 @@
},
{
"cell_type": "code",
- "execution_count": 29,
+ "execution_count": 30,
"id": "4ced7809-1414-44ae-aae7-c2d0d1dee9ad",
"metadata": {},
"outputs": [
@@ -2623,7 +2812,7 @@
" \n",
" \n",
" \n",
- " | 851 | \n",
+ " 847 | \n",
" 1377 | \n",
" Wednesday night, 28 Sivan 1581 | \n",
" Seleucid | \n",
@@ -2637,7 +2826,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 1929 | \n",
+ " 1923 | \n",
" 2649 | \n",
" Sunday night, 25 Kislev 1444 | \n",
" Seleucid | \n",
@@ -2651,7 +2840,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 3257 | \n",
+ " 3248 | \n",
" 4026 | \n",
" Wednesday night, 29 Tishrei 1541 | \n",
" Seleucid | \n",
@@ -2665,7 +2854,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 5511 | \n",
+ " 5493 | \n",
" 7237 | \n",
" Tuesday night, 22 Kislev 1435 | \n",
" Seleucid | \n",
@@ -2679,21 +2868,7 @@
" Wednesday | \n",
"
\n",
" \n",
- " | 5854 | \n",
- " 7637 | \n",
- " Monday night, 29 Ṭevet 1438 | \n",
- " Seleucid | \n",
- " 1127 | \n",
- " 1127 | \n",
- " 1438-10-29 | \n",
- " day | \n",
- " Legal document | \n",
- " 4 | \n",
- " Friday | \n",
- " Tuesday | \n",
- "
\n",
- " \n",
- " | 5857 | \n",
+ " 5839 | \n",
" 7642 | \n",
" Thursday night, 23 Tammuz 1538 | \n",
" Seleucid | \n",
@@ -2707,7 +2882,7 @@
" Friday | \n",
"
\n",
" \n",
- " | 6419 | \n",
+ " 6400 | \n",
" 8332 | \n",
" Friday night, 20 Iyar 4957 | \n",
" Anno Mundi | \n",
@@ -2721,7 +2896,7 @@
" Saturday | \n",
"
\n",
" \n",
- " | 29303 | \n",
+ " 29175 | \n",
" 34623 | \n",
" Sunday night, 20 Ṭevet 1578 | \n",
" Seleucid | \n",
@@ -2740,37 +2915,34 @@
],
"text/plain": [
" pgpid doc_date_original doc_date_calendar \\\n",
- "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
- "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
- "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
- "5511 7237 Tuesday night, 22 Kislev 1435 Seleucid \n",
- "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid \n",
- "5857 7642 Thursday night, 23 Tammuz 1538 Seleucid \n",
- "6419 8332 Friday night, 20 Iyar 4957 Anno Mundi \n",
- "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
+ "847 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
+ "1923 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
+ "3248 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
+ "5493 7237 Tuesday night, 22 Kislev 1435 Seleucid \n",
+ "5839 7642 Thursday night, 23 Tammuz 1538 Seleucid \n",
+ "6400 8332 Friday night, 20 Iyar 4957 Anno Mundi \n",
+ "29175 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
"\n",
" doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
- "851 1270 1270 1581-03-28 day \n",
- "1929 1133 1133 1444-09-25 day \n",
- "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
- "5511 1123-12-12 1123-12-12 1435-09-22 day \n",
- "5854 1127 1127 1438-10-29 day \n",
- "5857 1227-07-09 1227-07-09 1538-04-23 day \n",
- "6419 1197-05 1197-05 4957-02-20 day \n",
- "29303 1266/1267 1266/1267 1578-10-20 day \n",
+ "847 1270 1270 1581-03-28 day \n",
+ "1923 1133 1133 1444-09-25 day \n",
+ "3248 1229-09-18 1229-09-18 1541-07-29 day \n",
+ "5493 1123-12-12 1123-12-12 1435-09-22 day \n",
+ "5839 1227-07-09 1227-07-09 1538-04-23 day \n",
+ "6400 1197-05 1197-05 4957-02-20 day \n",
+ "29175 1266/1267 1266/1267 1578-10-20 day \n",
"\n",
" type undate_weekday undate_weekday_name orig_weekday \n",
- "851 Legal document 3 Thursday Thursday \n",
- "1929 Legal document 0 Monday Monday \n",
- "3257 Legal document 3 Thursday Thursday \n",
- "5511 Legal document 2 Wednesday Wednesday \n",
- "5854 Legal document 4 Friday Tuesday \n",
- "5857 Legal document 4 Friday Friday \n",
- "6419 Legal document 5 Saturday Saturday \n",
- "29303 Legal document 0 Monday Monday "
+ "847 Legal document 3 Thursday Thursday \n",
+ "1923 Legal document 0 Monday Monday \n",
+ "3248 Legal document 3 Thursday Thursday \n",
+ "5493 Legal document 2 Wednesday Wednesday \n",
+ "5839 Legal document 4 Friday Friday \n",
+ "6400 Legal document 5 Saturday Saturday \n",
+ "29175 Legal document 0 Monday Monday "
]
},
- "execution_count": 29,
+ "execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
@@ -2789,7 +2961,7 @@
},
{
"cell_type": "code",
- "execution_count": 30,
+ "execution_count": 31,
"id": "fedb5323-0e9c-476e-a7e2-95443d2f9e1d",
"metadata": {},
"outputs": [
@@ -2797,7 +2969,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "44 matches, 60 mismatches (42.31%)\n"
+ "46 matches, 60 mismatches (43.40%)\n"
]
},
{
@@ -2836,7 +3008,7 @@
" \n",
"
\n",
" \n",
- " | 5271 | \n",
+ " 5255 | \n",
" 6947 | \n",
" Monday 3 Iyyar 1740 | \n",
" Seleucid | \n",
@@ -2850,21 +3022,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 5854 | \n",
- " 7637 | \n",
- " Monday night, 29 Ṭevet 1438 | \n",
- " Seleucid | \n",
- " 1127 | \n",
- " 1127 | \n",
- " 1438-10-29 | \n",
- " day | \n",
- " Legal document | \n",
- " 4 | \n",
- " Friday | \n",
- " Tuesday | \n",
- "
\n",
- " \n",
- " | 8648 | \n",
+ " 8624 | \n",
" 11227 | \n",
" Monday 24 Jumādā I 517 | \n",
" Hijrī | \n",
@@ -2878,7 +3036,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 16397 | \n",
+ " 16299 | \n",
" 19649 | \n",
" Thursday 26 Iyyar 5306 | \n",
" Anno Mundi | \n",
@@ -2892,7 +3050,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 17723 | \n",
+ " 17622 | \n",
" 21094 | \n",
" Saturday 20 Rajab 550 | \n",
" Hijrī | \n",
@@ -2906,7 +3064,7 @@
" Saturday | \n",
"
\n",
" \n",
- " | 23099 | \n",
+ " 22986 | \n",
" 27479 | \n",
" Tuesday 11 Tammuz 5525 | \n",
" Anno Mundi | \n",
@@ -2920,12 +3078,12 @@
" Tuesday | \n",
"
\n",
" \n",
- " | 23104 | \n",
+ " 22991 | \n",
" 27484 | \n",
- " Friday 20th Shevat 5405 | \n",
+ " Friday 20 Shevat 5405 | \n",
" Anno Mundi | \n",
- " 1645 | \n",
- " 1645 | \n",
+ " 1645-02-16 | \n",
+ " 1645-02-16 | \n",
" 5405-11-20 | \n",
" day | \n",
" Legal document | \n",
@@ -2934,7 +3092,7 @@
" Friday | \n",
"
\n",
" \n",
- " | 23105 | \n",
+ " 22992 | \n",
" 27485 | \n",
" Sunday 22 Adar 5590 | \n",
" Anno Mundi | \n",
@@ -2948,7 +3106,7 @@
" Sunday | \n",
"
\n",
" \n",
- " | 23107 | \n",
+ " 22994 | \n",
" 27487 | \n",
" Thursday 15 Shevat 5450 | \n",
" Anno Mundi | \n",
@@ -2962,7 +3120,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 23109 | \n",
+ " 22996 | \n",
" 27489 | \n",
" Sunday 6 Nisan 5528 | \n",
" Anno Mundi | \n",
@@ -2976,12 +3134,12 @@
" Sunday | \n",
"
\n",
" \n",
- " | 23110 | \n",
+ " 22997 | \n",
" 27490 | \n",
- " Thursday 19th Elul 5428 | \n",
+ " Thursday 19 Elul 5428 | \n",
" Anno Mundi | \n",
- " 1668 | \n",
- " 1668 | \n",
+ " 1668-08-26 | \n",
+ " 1668-08-26 | \n",
" 5428-06-19 | \n",
" day | \n",
" Legal document | \n",
@@ -2990,7 +3148,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 23111 | \n",
+ " 22998 | \n",
" 27491 | \n",
" Tuesday 1 Kislev 5507 | \n",
" Anno Mundi | \n",
@@ -3004,7 +3162,7 @@
" Tuesday | \n",
"
\n",
" \n",
- " | 23116 | \n",
+ " 23003 | \n",
" 27496 | \n",
" Sunday 28 Elul 5511 | \n",
" Anno Mundi | \n",
@@ -3018,12 +3176,12 @@
" Sunday | \n",
"
\n",
" \n",
- " | 23117 | \n",
+ " 23004 | \n",
" 27497 | \n",
- " Sunday 17th Sivan 5423 | \n",
+ " Sunday 17 Sivan 5423 | \n",
" Anno Mundi | \n",
- " 1663 | \n",
- " 1663 | \n",
+ " 1663-06-22 | \n",
+ " 1663-06-22 | \n",
" 5423-03-17 | \n",
" day | \n",
" Legal document | \n",
@@ -3032,12 +3190,12 @@
" Sunday | \n",
"
\n",
" \n",
- " | 23118 | \n",
+ " 23005 | \n",
" 27498 | \n",
- " Sunday 25th Tevet 5409 | \n",
+ " Sunday 25 Tevet 5409 | \n",
" Anno Mundi | \n",
- " 1648 | \n",
- " 1648 | \n",
+ " 1649-01-09 | \n",
+ " 1649-01-09 | \n",
" 5409-10-25 | \n",
" day | \n",
" Legal document | \n",
@@ -3046,7 +3204,7 @@
" Sunday | \n",
"
\n",
" \n",
- " | 23120 | \n",
+ " 23007 | \n",
" 27500 | \n",
" Thursday 4 Sivan 5516 | \n",
" Anno Mundi | \n",
@@ -3060,7 +3218,7 @@
" Thursday | \n",
"
\n",
" \n",
- " | 23127 | \n",
+ " 23014 | \n",
" 27507 | \n",
" Sunday 25 Sivan 5556 | \n",
" Anno Mundi | \n",
@@ -3074,12 +3232,12 @@
" Sunday | \n",
"
\n",
" \n",
- " | 23131 | \n",
+ " 23018 | \n",
" 27511 | \n",
- " Wednesday 28th Tevet 5399 | \n",
+ " Wednesday 28 Tevet 5399 | \n",
" Anno Mundi | \n",
- " 1640 | \n",
- " 1640 | \n",
+ " 1639-01-04 | \n",
+ " 1639-01-04 | \n",
" 5399-10-28 | \n",
" day | \n",
" Legal document | \n",
@@ -3088,12 +3246,12 @@
" Wednesday | \n",
"
\n",
" \n",
- " | 23135 | \n",
+ " 23022 | \n",
" 27515 | \n",
- " Monday 15th Iyyar 5414 | \n",
+ " Monday 15 Iyyar 5414 | \n",
" Anno Mundi | \n",
- " 1654 | \n",
- " 1654 | \n",
+ " 1654-05-02 | \n",
+ " 1654-05-02 | \n",
" 5414-02-15 | \n",
" day | \n",
" Legal document | \n",
@@ -3102,7 +3260,7 @@
" Monday | \n",
"
\n",
" \n",
- " | 23136 | \n",
+ " 23023 | \n",
" 27516 | \n",
" Thursday 24 Nisan 5481 | \n",
" Anno Mundi | \n",
@@ -3115,79 +3273,93 @@
" Monday | \n",
" Thursday | \n",
"
\n",
+ " \n",
+ " | 23053 | \n",
+ " 27546 | \n",
+ " Thursday 13th Nisan 5544 | \n",
+ " Anno Mundi | \n",
+ " 1784 | \n",
+ " 1784 | \n",
+ " 5544-01-13 | \n",
+ " day | \n",
+ " List or table | \n",
+ " 6 | \n",
+ " Sunday | \n",
+ " Thursday | \n",
+ "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
- "5271 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n",
- "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
- "8648 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n",
- "16397 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n",
- "17723 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n",
- "23099 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n",
- "23104 27484 Friday 20th Shevat 5405 Anno Mundi 1645 \n",
- "23105 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n",
- "23107 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n",
- "23109 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n",
- "23110 27490 Thursday 19th Elul 5428 Anno Mundi 1668 \n",
- "23111 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n",
- "23116 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n",
- "23117 27497 Sunday 17th Sivan 5423 Anno Mundi 1663 \n",
- "23118 27498 Sunday 25th Tevet 5409 Anno Mundi 1648 \n",
- "23120 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n",
- "23127 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n",
- "23131 27511 Wednesday 28th Tevet 5399 Anno Mundi 1640 \n",
- "23135 27515 Monday 15th Iyyar 5414 Anno Mundi 1654 \n",
- "23136 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n",
+ " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
+ "5255 6947 Monday 3 Iyyar 1740 Seleucid 1429-04-07 \n",
+ "8624 11227 Monday 24 Jumādā I 517 Hijrī 1123-07-20 \n",
+ "16299 19649 Thursday 26 Iyyar 5306 Anno Mundi 1546-04-28 \n",
+ "17622 21094 Saturday 20 Rajab 550 Hijrī 1155-09-19 \n",
+ "22986 27479 Tuesday 11 Tammuz 5525 Anno Mundi 1765-06-30 \n",
+ "22991 27484 Friday 20 Shevat 5405 Anno Mundi 1645-02-16 \n",
+ "22992 27485 Sunday 22 Adar 5590 Anno Mundi 1830-03-17 \n",
+ "22994 27487 Thursday 15 Shevat 5450 Anno Mundi 1690-01-25 \n",
+ "22996 27489 Sunday 6 Nisan 5528 Anno Mundi 1768-03-24 \n",
+ "22997 27490 Thursday 19 Elul 5428 Anno Mundi 1668-08-26 \n",
+ "22998 27491 Tuesday 1 Kislev 5507 Anno Mundi 1746-11-14 \n",
+ "23003 27496 Sunday 28 Elul 5511 Anno Mundi 1751-09-18 \n",
+ "23004 27497 Sunday 17 Sivan 5423 Anno Mundi 1663-06-22 \n",
+ "23005 27498 Sunday 25 Tevet 5409 Anno Mundi 1649-01-09 \n",
+ "23007 27500 Thursday 4 Sivan 5516 Anno Mundi 1756-06-02 \n",
+ "23014 27507 Sunday 25 Sivan 5556 Anno Mundi 1796-07-01 \n",
+ "23018 27511 Wednesday 28 Tevet 5399 Anno Mundi 1639-01-04 \n",
+ "23022 27515 Monday 15 Iyyar 5414 Anno Mundi 1654-05-02 \n",
+ "23023 27516 Thursday 24 Nisan 5481 Anno Mundi 1721-04-21 \n",
+ "23053 27546 Thursday 13th Nisan 5544 Anno Mundi 1784 \n",
"\n",
" undate_standard undate_orig orig_date_precision type \\\n",
- "5271 1429-04-07 1740-02-03 day Legal document \n",
- "5854 1127 1438-10-29 day Legal document \n",
- "8648 1123-07-20 0517-05-24 day Paraliterary text \n",
- "16397 1546-04-28 5306-02-26 day Legal document \n",
- "17723 1155-09-19 0550-07-20 day Legal document \n",
- "23099 1765-06-30 5525-04-11 day Legal document \n",
- "23104 1645 5405-11-20 day Legal document \n",
- "23105 1830-03-17 5590-12-22 day Legal document \n",
- "23107 1690-01-25 5450-11-15 day Legal document \n",
- "23109 1768-03-24 5528-01-06 day Legal document \n",
- "23110 1668 5428-06-19 day Legal document \n",
- "23111 1746-11-14 5507-09-01 day Legal document \n",
- "23116 1751-09-18 5511-06-28 day Legal document \n",
- "23117 1663 5423-03-17 day Legal document \n",
- "23118 1648 5409-10-25 day Legal document \n",
- "23120 1756-06-02 5516-03-04 day Legal document \n",
- "23127 1796-07-01 5556-03-25 day Legal document \n",
- "23131 1640 5399-10-28 day Legal document \n",
- "23135 1654 5414-02-15 day Legal document \n",
- "23136 1721-04-21 5481-01-24 day Legal document \n",
+ "5255 1429-04-07 1740-02-03 day Legal document \n",
+ "8624 1123-07-20 0517-05-24 day Paraliterary text \n",
+ "16299 1546-04-28 5306-02-26 day Legal document \n",
+ "17622 1155-09-19 0550-07-20 day Legal document \n",
+ "22986 1765-06-30 5525-04-11 day Legal document \n",
+ "22991 1645-02-16 5405-11-20 day Legal document \n",
+ "22992 1830-03-17 5590-12-22 day Legal document \n",
+ "22994 1690-01-25 5450-11-15 day Legal document \n",
+ "22996 1768-03-24 5528-01-06 day Legal document \n",
+ "22997 1668-08-26 5428-06-19 day Legal document \n",
+ "22998 1746-11-14 5507-09-01 day Legal document \n",
+ "23003 1751-09-18 5511-06-28 day Legal document \n",
+ "23004 1663-06-22 5423-03-17 day Legal document \n",
+ "23005 1649-01-09 5409-10-25 day Legal document \n",
+ "23007 1756-06-02 5516-03-04 day Legal document \n",
+ "23014 1796-07-01 5556-03-25 day Legal document \n",
+ "23018 1639-01-04 5399-10-28 day Legal document \n",
+ "23022 1654-05-02 5414-02-15 day Legal document \n",
+ "23023 1721-04-21 5481-01-24 day Legal document \n",
+ "23053 1784 5544-01-13 day List or table \n",
"\n",
" undate_weekday undate_weekday_name orig_weekday \n",
- "5271 3 Thursday Monday \n",
- "5854 4 Friday Tuesday \n",
- "8648 4 Friday Monday \n",
- "16397 2 Wednesday Thursday \n",
- "17723 0 Monday Saturday \n",
- "23099 6 Sunday Tuesday \n",
- "23104 3 Thursday Friday \n",
- "23105 2 Wednesday Sunday \n",
- "23107 2 Wednesday Thursday \n",
- "23109 3 Thursday Sunday \n",
- "23110 6 Sunday Thursday \n",
- "23111 0 Monday Tuesday \n",
- "23116 5 Saturday Sunday \n",
- "23117 4 Friday Sunday \n",
- "23118 5 Saturday Sunday \n",
- "23120 2 Wednesday Thursday \n",
- "23127 4 Friday Sunday \n",
- "23131 1 Tuesday Wednesday \n",
- "23135 5 Saturday Monday \n",
- "23136 0 Monday Thursday "
+ "5255 3 Thursday Monday \n",
+ "8624 4 Friday Monday \n",
+ "16299 2 Wednesday Thursday \n",
+ "17622 0 Monday Saturday \n",
+ "22986 6 Sunday Tuesday \n",
+ "22991 3 Thursday Friday \n",
+ "22992 2 Wednesday Sunday \n",
+ "22994 2 Wednesday Thursday \n",
+ "22996 3 Thursday Sunday \n",
+ "22997 6 Sunday Thursday \n",
+ "22998 0 Monday Tuesday \n",
+ "23003 5 Saturday Sunday \n",
+ "23004 4 Friday Sunday \n",
+ "23005 5 Saturday Sunday \n",
+ "23007 2 Wednesday Thursday \n",
+ "23014 4 Friday Sunday \n",
+ "23018 1 Tuesday Wednesday \n",
+ "23022 5 Saturday Monday \n",
+ "23023 0 Monday Thursday \n",
+ "23053 6 Sunday Thursday "
]
},
- "execution_count": 30,
+ "execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
@@ -3195,9 +3367,13 @@
"source": [
"matches = weekday_dates[weekday_dates.undate_weekday_name == weekday_dates.orig_weekday]\n",
"\n",
- "mismatches = weekday_dates[weekday_dates.undate_weekday_name != weekday_dates.orig_weekday]\n",
+ "mismatches = weekday_dates[\n",
+ " weekday_dates.undate_weekday_name != weekday_dates.orig_weekday\n",
+ "]\n",
"\n",
- "print(f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches)/(len(matches)+len(mismatches)))*100:0.2f}%)\")\n",
+ "print(\n",
+ " f\"{len(matches)} matches, {len(mismatches)} mismatches ({(len(matches) / (len(matches) + len(mismatches))) * 100:0.2f}%)\"\n",
+ ")\n",
"mismatches.head(20)"
]
},
@@ -3211,7 +3387,7 @@
},
{
"cell_type": "code",
- "execution_count": 31,
+ "execution_count": 32,
"id": "d6476907-1628-4d68-ab1f-43c95e123707",
"metadata": {},
"outputs": [
@@ -3219,13 +3395,13 @@
"data": {
"text/plain": [
"doc_date_calendar\n",
- "Anno Mundi 55\n",
- "Seleucid 3\n",
+ "Anno Mundi 56\n",
+ "Seleucid 2\n",
"Hijrī 2\n",
"Name: count, dtype: int64"
]
},
- "execution_count": 31,
+ "execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
@@ -3236,7 +3412,7 @@
},
{
"cell_type": "code",
- "execution_count": 32,
+ "execution_count": 33,
"id": "18b71d18-5d5b-4f92-8801-499bcf412efe",
"metadata": {},
"outputs": [
@@ -3245,16 +3421,16 @@
"text/plain": [
"orig_weekday\n",
"Wednesday 17\n",
- "Sunday 12\n",
+ "Sunday 13\n",
"Monday 10\n",
"Thursday 9\n",
- "Tuesday 7\n",
+ "Tuesday 6\n",
"Friday 4\n",
"Saturday 1\n",
"Name: count, dtype: int64"
]
},
- "execution_count": 32,
+ "execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
@@ -3265,7 +3441,7 @@
},
{
"cell_type": "code",
- "execution_count": 33,
+ "execution_count": 34,
"id": "eb7ea065-e4b5-47aa-9538-8dc9851ea572",
"metadata": {},
"outputs": [
@@ -3273,7 +3449,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
- "1 mismatches that include text 'night'\n"
+ "0 mismatches that include text 'night'\n"
]
},
{
@@ -3311,36 +3487,17 @@
" \n",
" \n",
" \n",
- " \n",
- " | 5854 | \n",
- " 7637 | \n",
- " Monday night, 29 Ṭevet 1438 | \n",
- " Seleucid | \n",
- " 1127 | \n",
- " 1127 | \n",
- " 1438-10-29 | \n",
- " day | \n",
- " Legal document | \n",
- " 4 | \n",
- " Friday | \n",
- " Tuesday | \n",
- "
\n",
" \n",
"\n",
""
],
"text/plain": [
- " pgpid doc_date_original doc_date_calendar doc_date_standard \\\n",
- "5854 7637 Monday night, 29 Ṭevet 1438 Seleucid 1127 \n",
- "\n",
- " undate_standard undate_orig orig_date_precision type \\\n",
- "5854 1127 1438-10-29 day Legal document \n",
- "\n",
- " undate_weekday undate_weekday_name orig_weekday \n",
- "5854 4 Friday Tuesday "
+ "Empty DataFrame\n",
+ "Columns: [pgpid, doc_date_original, doc_date_calendar, doc_date_standard, undate_standard, undate_orig, orig_date_precision, type, undate_weekday, undate_weekday_name, orig_weekday]\n",
+ "Index: []"
]
},
- "execution_count": 33,
+ "execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
@@ -3366,7 +3523,7 @@
},
{
"cell_type": "code",
- "execution_count": 34,
+ "execution_count": 35,
"id": "ece780b8-2eb2-4cbc-9195-27def665f7fa",
"metadata": {},
"outputs": [
@@ -3375,23 +3532,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.Chart(...)"
]
},
- "execution_count": 34,
+ "execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# get numeric weekday\n",
- "orig_dates_parsed['undate_weekday'] = orig_dates_parsed.undate_orig.apply(lambda x: x.earliest.weekday)\n",
- "orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
+ "orig_dates_parsed[\"undate_weekday\"] = orig_dates_parsed.undate_orig.apply(\n",
+ " lambda x: x.earliest.weekday\n",
+ ")\n",
+ "orig_dates_parsed[\"undate_weekday_name\"] = orig_dates_parsed.undate_weekday.apply(\n",
+ " lambda x: days[x]\n",
+ ")\n",
"\n",
"# restrict to dates with day precision; the rest are just using earliest day\n",
- "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == 'day']\n",
+ "orig_dates_days = orig_dates_parsed[orig_dates_parsed.orig_date_precision == \"day\"]\n",
"\n",
- "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid']]).mark_rect().encode(\n",
- " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
- " alt.Color('count(pgpid)', title='# of documents')\n",
- ").properties(title='document frequency by weekday')\n"
+ "alt.Chart(\n",
+ " orig_dates_days[[\"undate_weekday\", \"undate_weekday_name\", \"pgpid\"]]\n",
+ ").mark_rect().encode(\n",
+ " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n",
+ " alt.Color(\"count(pgpid)\", title=\"# of documents\"),\n",
+ ").properties(title=\"document frequency by weekday\")"
]
},
{
"cell_type": "code",
- "execution_count": 35,
+ "execution_count": 36,
"id": "6b2f24de-18ce-4f40-b300-e8cc334a338c",
"metadata": {},
"outputs": [
@@ -3475,17 +3638,17 @@
"data": {
"text/plain": [
"undate_weekday_name\n",
- "Monday 305\n",
- "Thursday 282\n",
- "Tuesday 241\n",
- "Sunday 229\n",
- "Wednesday 229\n",
- "Friday 215\n",
- "Saturday 98\n",
+ "Monday 362\n",
+ "Thursday 337\n",
+ "Tuesday 303\n",
+ "Sunday 284\n",
+ "Wednesday 267\n",
+ "Friday 265\n",
+ "Saturday 129\n",
"Name: count, dtype: int64"
]
},
- "execution_count": 35,
+ "execution_count": 36,
"metadata": {},
"output_type": "execute_result"
}
@@ -3496,7 +3659,7 @@
},
{
"cell_type": "code",
- "execution_count": 36,
+ "execution_count": 37,
"id": "dea83b43-b379-4807-8a33-8e26d7f4f8e7",
"metadata": {},
"outputs": [
@@ -3505,23 +3668,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.FacetChart(...)"
]
},
- "execution_count": 36,
+ "execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "weekday_calendar_chart = alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
- " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
- " # alt.Y('doc_date_calendar'),\n",
- " alt.Color('count(pgpid)')\n",
- ").facet(row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")).properties(title='document frequency by weekday and calendar')\n",
+ "weekday_calendar_chart = (\n",
+ " alt.Chart(\n",
+ " weekday_dates[\n",
+ " [\"undate_weekday\", \"undate_weekday_name\", \"pgpid\", \"doc_date_calendar\"]\n",
+ " ]\n",
+ " )\n",
+ " .mark_rect()\n",
+ " .encode(\n",
+ " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n",
+ " # alt.Y('doc_date_calendar'),\n",
+ " alt.Color(\"count(pgpid)\"),\n",
+ " )\n",
+ " .facet(row=alt.Facet(\"doc_date_calendar\", title=\"Original Calendar\"))\n",
+ " .properties(title=\"document frequency by weekday and calendar\")\n",
+ ")\n",
"weekday_calendar_chart"
]
},
@@ -3600,7 +3773,7 @@
},
{
"cell_type": "code",
- "execution_count": 37,
+ "execution_count": 38,
"id": "cfecdb64-03b4-405b-b1f3-85e876f55680",
"metadata": {},
"outputs": [
@@ -3608,13 +3781,13 @@
"data": {
"text/plain": [
"doc_date_calendar\n",
- "Anno Mundi 82\n",
+ "Anno Mundi 84\n",
"Seleucid 20\n",
"Hijrī 2\n",
"Name: count, dtype: int64"
]
},
- "execution_count": 37,
+ "execution_count": 38,
"metadata": {},
"output_type": "execute_result"
}
@@ -3633,7 +3806,7 @@
},
{
"cell_type": "code",
- "execution_count": 38,
+ "execution_count": 39,
"id": "e66917b0-2221-42dd-a99b-df847b8e815b",
"metadata": {},
"outputs": [
@@ -3642,23 +3815,23 @@
"text/html": [
"\n",
"\n",
- "\n",
+ "\n",
""
],
"text/plain": [
"alt.FacetChart(...)"
]
},
- "execution_count": 38,
+ "execution_count": 39,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "weekday_calendar_chart.resolve_scale(color='independent')"
+ "weekday_calendar_chart.resolve_scale(color=\"independent\")"
]
},
{
@@ -3732,269 +3905,39 @@
},
{
"cell_type": "code",
- "execution_count": 42,
+ "execution_count": 4,
"id": "6a7a0bf5-f8c2-4034-8495-2fb4b297740a",
"metadata": {},
"outputs": [
{
- "data": {
- "text/html": [
- "\n",
- "\n",
- "
\n",
- " \n",
- " \n",
- " | \n",
- " pgpid | \n",
- " doc_date_original | \n",
- " doc_date_calendar | \n",
- " doc_date_standard | \n",
- " undate_standard | \n",
- " undate_orig | \n",
- " orig_date_precision | \n",
- " type | \n",
- " undate_weekday | \n",
- " undate_weekday_name | \n",
- " orig_weekday | \n",
- " century | \n",
- "
\n",
- " \n",
- " \n",
- " \n",
- " | 851 | \n",
- " 1377 | \n",
- " Wednesday night, 28 Sivan 1581 | \n",
- " Seleucid | \n",
- " 1270 | \n",
- " 1270 | \n",
- " 1581-03-28 | \n",
- " day | \n",
- " Legal document | \n",
- " 3 | \n",
- " Thursday | \n",
- " Thursday | \n",
- " 1200s | \n",
- "
\n",
- " \n",
- " | 1714 | \n",
- " 2418 | \n",
- " Monday 20 Tevet 1520 | \n",
- " Seleucid | \n",
- " 1208-12-29 | \n",
- " 1208-12-29 | \n",
- " 1520-10-20 | \n",
- " day | \n",
- " Legal document | \n",
- " 0 | \n",
- " Monday | \n",
- " Monday | \n",
- " 1200s | \n",
- "
\n",
- " \n",
- " | 1929 | \n",
- " 2649 | \n",
- " Sunday night, 25 Kislev 1444 | \n",
- " Seleucid | \n",
- " 1133 | \n",
- " 1133 | \n",
- " 1444-09-25 | \n",
- " day | \n",
- " Legal document | \n",
- " 0 | \n",
- " Monday | \n",
- " Monday | \n",
- " 1100s | \n",
- "
\n",
- " \n",
- " | 2013 | \n",
- " 2739 | \n",
- " Wednesday 29th Elul 1354 | \n",
- " Seleucid | \n",
- " 1043-09-07 | \n",
- " 1043-09-07 | \n",
- " 1354-06-29 | \n",
- " day | \n",
- " Legal document | \n",
- " 2 | \n",
- " Wednesday | \n",
- " Wednesday | \n",
- " 1000s | \n",
- "
\n",
- " \n",
- " | 3257 | \n",
- " 4026 | \n",
- " Wednesday night, 29 Tishrei 1541 | \n",
- " Seleucid | \n",
- " 1229-09-18 | \n",
- " 1229-09-18 | \n",
- " 1541-07-29 | \n",
- " day | \n",
- " Legal document | \n",
- " 3 | \n",
- " Thursday | \n",
- " Thursday | \n",
- " 1200s | \n",
- "
\n",
- " \n",
- " | ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- " ... | \n",
- "
\n",
- " \n",
- " | 29303 | \n",
- " 34623 | \n",
- " Sunday night, 20 Ṭevet 1578 | \n",
- " Seleucid | \n",
- " 1266/1267 | \n",
- " 1266/1267 | \n",
- " 1578-10-20 | \n",
- " day | \n",
- " Legal document | \n",
- " 0 | \n",
- " Monday | \n",
- " Monday | \n",
- " 1200s | \n",
- "
\n",
- " \n",
- " | 29924 | \n",
- " 35264 | \n",
- " Wednesday 13 Ṭevet 1526 | \n",
- " Seleucid | \n",
- " 1214/1215 | \n",
- " 1214/1215 | \n",
- " 1526-10-13 | \n",
- " day | \n",
- " Legal document | \n",
- " 2 | \n",
- " Wednesday | \n",
- " Wednesday | \n",
- " 1200s | \n",
- "
\n",
- " \n",
- " | 34008 | \n",
- " 39564 | \n",
- " Monday 16 Tevet 1339 | \n",
- " Seleucid | \n",
- " 1027-12-18 | \n",
- " 1027-12-18 | \n",
- " 1339-10-16 | \n",
- " day | \n",
- " Legal document | \n",
- " 0 | \n",
- " Monday | \n",
- " Monday | \n",
- " 1000s | \n",
- "
\n",
- " \n",
- " | 34466 | \n",
- " 40035 | \n",
- " Monday 1st Iyyar 1437 | \n",
- " Seleucid | \n",
- " 1126-04-26 | \n",
- " 1126-04-26 | \n",
- " 1437-02-01 | \n",
- " day | \n",
- " Legal document | \n",
- " 0 | \n",
- " Monday | \n",
- " Monday | \n",
- " 1100s | \n",
- "
\n",
- " \n",
- " | 34467 | \n",
- " 40036 | \n",
- " Friday 15 of Adar 1443 | \n",
- " Seleucid | \n",
- " 1132-03-04 | \n",
- " 1132-03-04 | \n",
- " 1443-12-15 | \n",
- " day | \n",
- " Legal document | \n",
- " 4 | \n",
- " Friday | \n",
- " Friday | \n",
- " 1100s | \n",
- "
\n",
- " \n",
- "
\n",
- "
104 rows × 12 columns
\n",
- "
"
- ],
- "text/plain": [
- " pgpid doc_date_original doc_date_calendar \\\n",
- "851 1377 Wednesday night, 28 Sivan 1581 Seleucid \n",
- "1714 2418 Monday 20 Tevet 1520 Seleucid \n",
- "1929 2649 Sunday night, 25 Kislev 1444 Seleucid \n",
- "2013 2739 Wednesday 29th Elul 1354 Seleucid \n",
- "3257 4026 Wednesday night, 29 Tishrei 1541 Seleucid \n",
- "... ... ... ... \n",
- "29303 34623 Sunday night, 20 Ṭevet 1578 Seleucid \n",
- "29924 35264 Wednesday 13 Ṭevet 1526 Seleucid \n",
- "34008 39564 Monday 16 Tevet 1339 Seleucid \n",
- "34466 40035 Monday 1st Iyyar 1437 Seleucid \n",
- "34467 40036 Friday 15 of Adar 1443 Seleucid \n",
- "\n",
- " doc_date_standard undate_standard undate_orig orig_date_precision \\\n",
- "851 1270 1270 1581-03-28 day \n",
- "1714 1208-12-29 1208-12-29 1520-10-20 day \n",
- "1929 1133 1133 1444-09-25 day \n",
- "2013 1043-09-07 1043-09-07 1354-06-29 day \n",
- "3257 1229-09-18 1229-09-18 1541-07-29 day \n",
- "... ... ... ... ... \n",
- "29303 1266/1267 1266/1267 1578-10-20 day \n",
- "29924 1214/1215 1214/1215 1526-10-13 day \n",
- "34008 1027-12-18 1027-12-18 1339-10-16 day \n",
- "34466 1126-04-26 1126-04-26 1437-02-01 day \n",
- "34467 1132-03-04 1132-03-04 1443-12-15 day \n",
- "\n",
- " type undate_weekday undate_weekday_name orig_weekday century \n",
- "851 Legal document 3 Thursday Thursday 1200s \n",
- "1714 Legal document 0 Monday Monday 1200s \n",
- "1929 Legal document 0 Monday Monday 1100s \n",
- "2013 Legal document 2 Wednesday Wednesday 1000s \n",
- "3257 Legal document 3 Thursday Thursday 1200s \n",
- "... ... ... ... ... ... \n",
- "29303 Legal document 0 Monday Monday 1200s \n",
- "29924 Legal document 2 Wednesday Wednesday 1200s \n",
- "34008 Legal document 0 Monday Monday 1000s \n",
- "34466 Legal document 0 Monday Monday 1100s \n",
- "34467 Legal document 4 Friday Friday 1100s \n",
- "\n",
- "[104 rows x 12 columns]"
- ]
- },
- "execution_count": 42,
- "metadata": {},
- "output_type": "execute_result"
+ "ename": "NameError",
+ "evalue": "name 'orig_dates_days' is not defined",
+ "output_type": "error",
+ "traceback": [
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
+ "Cell \u001b[0;32mIn[4], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m# get rough century (gregorian calendar)\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m weekday_dates[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mcentury\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[43morig_dates_days\u001b[49m\u001b[38;5;241m.\u001b[39mundate_orig\u001b[38;5;241m.\u001b[39mapply(\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mlambda\u001b[39;00m x: (\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mx\u001b[38;5;241m.\u001b[39mearliest\u001b[38;5;241m.\u001b[39myear\u001b[38;5;132;01m:\u001b[39;00m\u001b[38;5;124m04\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)[:\u001b[38;5;241m2\u001b[39m] \u001b[38;5;241m+\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m00s\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m weekday_dates[\n\u001b[1;32m 7\u001b[0m [\n\u001b[1;32m 8\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpgpid\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 15\u001b[0m ]\n\u001b[1;32m 16\u001b[0m ]\u001b[38;5;241m.\u001b[39mhead()\n\u001b[1;32m 17\u001b[0m weekday_dates\n",
+ "\u001b[0;31mNameError\u001b[0m: name 'orig_dates_days' is not defined"
+ ]
}
],
"source": [
"# get rough century (gregorian calendar)\n",
- "weekday_dates['century'] = orig_dates_days.undate_orig.apply(lambda x: (\"%04d\" % x.earliest.year)[:2] + \"00s\")\n",
+ "weekday_dates[\"century\"] = orig_dates_days.undate_orig.apply(\n",
+ " lambda x: (f\"{x.earliest.year:04}\")[:2] + \"00s\"\n",
+ ")\n",
"\n",
- "weekday_dates[['pgpid', 'doc_date_original', 'doc_date_calendar', 'doc_date_standard', 'undate_standard', 'undate_orig', 'century']].head()\n",
+ "weekday_dates[\n",
+ " [\n",
+ " \"pgpid\",\n",
+ " \"doc_date_original\",\n",
+ " \"doc_date_calendar\",\n",
+ " \"doc_date_standard\",\n",
+ " \"undate_standard\",\n",
+ " \"undate_orig\",\n",
+ " \"century\",\n",
+ " ]\n",
+ "].head()\n",
"weekday_dates"
]
},
@@ -4086,12 +4029,13 @@
}
],
"source": [
- "\n",
- "alt.Chart(weekday_dates[['undate_weekday', 'undate_weekday_name', 'pgpid', 'century']]).mark_rect().encode(\n",
- " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
- " alt.Y('century'),\n",
- " alt.Color('count(pgpid)')\n",
- ").properties(title='document frequency by weekday and century')\n"
+ "alt.Chart(\n",
+ " weekday_dates[[\"undate_weekday\", \"undate_weekday_name\", \"pgpid\", \"century\"]]\n",
+ ").mark_rect().encode(\n",
+ " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n",
+ " alt.Y(\"century\"),\n",
+ " alt.Color(\"count(pgpid)\"),\n",
+ ").properties(title=\"document frequency by weekday and century\")"
]
},
{
@@ -4201,17 +4145,19 @@
"# what about heat map by month?\n",
"\n",
"# get numeric month\n",
- "orig_dates_parsed['undate_month'] = orig_dates_parsed.undate_orig.apply(lambda x: x.month)\n",
+ "orig_dates_parsed[\"undate_month\"] = orig_dates_parsed.undate_orig.apply(\n",
+ " lambda x: x.month\n",
+ ")\n",
"# orig_dates_parsed['undate_weekday_name'] = orig_dates_parsed.undate_weekday.apply(lambda x: days[x])\n",
"\n",
"has_month = orig_dates_parsed[orig_dates_parsed.undate_month.notna()]\n",
"\n",
- "alt.Chart(has_month[['undate_month', 'pgpid', 'doc_date_calendar']]).mark_rect().encode(\n",
- " alt.X('undate_month', title='month'),\n",
- " alt.Color('count(pgpid)', title='# of documents')\n",
- ").facet(\n",
- " row=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
- ").properties(title='Document frequency by month and calendar')"
+ "alt.Chart(has_month[[\"undate_month\", \"pgpid\", \"doc_date_calendar\"]]).mark_rect().encode(\n",
+ " alt.X(\"undate_month\", title=\"month\"),\n",
+ " alt.Color(\"count(pgpid)\", title=\"# of documents\"),\n",
+ ").facet(row=alt.Facet(\"doc_date_calendar\", title=\"Original Calendar\")).properties(\n",
+ " title=\"Document frequency by month and calendar\"\n",
+ ")"
]
},
{
@@ -4370,15 +4316,25 @@
"source": [
"# weekday frequency by month?\n",
"\n",
- "orig_dates_days['undate_month'] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n",
+ "orig_dates_days[\"undate_month\"] = orig_dates_days.undate_orig.apply(lambda x: x.month)\n",
"\n",
- "alt.Chart(orig_dates_days[['undate_weekday', 'undate_weekday_name', 'pgpid', 'undate_month', 'doc_date_calendar']]).mark_rect().encode(\n",
- " alt.X('undate_weekday_name', sort=days, title='weekday'),\n",
- " alt.Y('undate_month', title=\"month\"),\n",
- " alt.Color('count(pgpid)')\n",
- ").facet(\n",
- " column=alt.Facet('doc_date_calendar', title=\"Original Calendar\")\n",
- ").properties(title='Document frequency by weekday and month (1,557 documents)')\n"
+ "alt.Chart(\n",
+ " orig_dates_days[\n",
+ " [\n",
+ " \"undate_weekday\",\n",
+ " \"undate_weekday_name\",\n",
+ " \"pgpid\",\n",
+ " \"undate_month\",\n",
+ " \"doc_date_calendar\",\n",
+ " ]\n",
+ " ]\n",
+ ").mark_rect().encode(\n",
+ " alt.X(\"undate_weekday_name\", sort=days, title=\"weekday\"),\n",
+ " alt.Y(\"undate_month\", title=\"month\"),\n",
+ " alt.Color(\"count(pgpid)\"),\n",
+ ").facet(column=alt.Facet(\"doc_date_calendar\", title=\"Original Calendar\")).properties(\n",
+ " title=\"Document frequency by weekday and month (1,557 documents)\"\n",
+ ")"
]
}
],
diff --git a/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb b/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb
index 38efa6c..f7fb16a 100644
--- a/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb
+++ b/examples/shakespeare-and-company-project/shxco_partial_date_durations.ipynb
@@ -323,19 +323,20 @@
"outputs": [],
"source": [
"from undate import UndateInterval\n",
- "from undate.date import ONE_DAY\n",
"from undate.converters.iso8601 import ISO8601DateFormat\n",
+ "from undate.date import ONE_DAY\n",
+ "\n",
"\n",
"def undate_duration(start_date, end_date):\n",
- " isoformat = ISO8601DateFormat()\n",
+ " isoformat = ISO8601DateFormat()\n",
"\n",
- " unstart = isoformat.parse(start_date)\n",
- " unend = isoformat.parse(end_date)\n",
- " interval = UndateInterval(earliest=unstart, latest=unend)\n",
+ " unstart = isoformat.parse(start_date)\n",
+ " unend = isoformat.parse(end_date)\n",
+ " interval = UndateInterval(earliest=unstart, latest=unend)\n",
"\n",
- " # subtract one here for simplicity of comparison,\n",
- " # to reconcile differences between duration logic\n",
- " return interval.duration() - ONE_DAY"
+ " # subtract one here for simplicity of comparison,\n",
+ " # to reconcile differences between duration logic\n",
+ " return interval.duration() - ONE_DAY"
]
},
{
@@ -461,7 +462,15 @@
"# identify subscription events with duration information\n",
"subs_duration = events_df[events_df.subscription_duration_days.notna()]\n",
"# limit to fields that are relevant for this exploration\n",
- "subs_duration = subs_duration[['member_names', 'start_date', 'end_date', 'subscription_duration', 'subscription_duration_days']]\n",
+ "subs_duration = subs_duration[\n",
+ " [\n",
+ " \"member_names\",\n",
+ " \"start_date\",\n",
+ " \"end_date\",\n",
+ " \"subscription_duration\",\n",
+ " \"subscription_duration_days\",\n",
+ " ]\n",
+ "]\n",
"subs_duration.head()"
]
},
@@ -839,7 +848,9 @@
],
"source": [
"# add a new field for duration as calculated by Undate using the method defined previously\n",
- "subs_duration[\"undate_duration\"] = subs_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n",
+ "subs_duration[\"undate_duration\"] = subs_duration.apply(\n",
+ " lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1\n",
+ ")\n",
"subs_duration.head()"
]
},
@@ -1168,7 +1179,10 @@
],
"source": [
"# what's the difference between the two?\n",
- "subs_duration['duration_diff'] = subs_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days, axis=1)\n",
+ "subs_duration[\"duration_diff\"] = subs_duration.apply(\n",
+ " lambda row: row.undate_duration.astype(\"int\") - row.subscription_duration_days,\n",
+ " axis=1,\n",
+ ")\n",
"subs_duration"
]
},
@@ -1206,7 +1220,7 @@
}
],
"source": [
- "subs_duration['duration_diff'].value_counts()"
+ "subs_duration[\"duration_diff\"].value_counts()"
]
},
{
@@ -1693,7 +1707,7 @@
],
"source": [
"# lots of one-month subscriptions, what do the discrepancies look like?\n",
- "subset_subdurations[subset_subdurations.subscription_duration == '1 month'].head(15)"
+ "subset_subdurations[subset_subdurations.subscription_duration == \"1 month\"].head(15)"
]
},
{
@@ -1964,7 +1978,7 @@
],
"source": [
"# durations other than one month\n",
- "subset_subdurations[subset_subdurations.subscription_duration != '1 month'].head(15)"
+ "subset_subdurations[subset_subdurations.subscription_duration != \"1 month\"].head(15)"
]
},
{
@@ -2076,7 +2090,9 @@
"source": [
"borrow_duration = events_df[events_df.borrow_duration_days.notna()]\n",
"# limit to fields we care about for this check\n",
- "borrow_duration = borrow_duration[['member_names', 'start_date', 'end_date', 'borrow_duration_days']]\n",
+ "borrow_duration = borrow_duration[\n",
+ " [\"member_names\", \"start_date\", \"end_date\", \"borrow_duration_days\"]\n",
+ "]\n",
"borrow_duration.head()"
]
},
@@ -2323,7 +2339,9 @@
],
"source": [
"# add a new field for duration as calculated by undate\n",
- "borrow_duration[\"undate_duration\"] = borrow_duration.apply(lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1)\n",
+ "borrow_duration[\"undate_duration\"] = borrow_duration.apply(\n",
+ " lambda row: undate_duration(str(row.start_date), str(row.end_date)), axis=1\n",
+ ")\n",
"borrow_duration.head(10)"
]
},
@@ -2496,7 +2514,9 @@
],
"source": [
"# what's the difference between the two?\n",
- "borrow_duration['duration_diff'] = borrow_duration.apply(lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1)\n",
+ "borrow_duration[\"duration_diff\"] = borrow_duration.apply(\n",
+ " lambda row: row.undate_duration.astype(\"int\") - row.borrow_duration_days, axis=1\n",
+ ")\n",
"borrow_duration.head(10)"
]
},
diff --git a/pyproject.toml b/pyproject.toml
index ac39aac..09c8ca5 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -6,13 +6,13 @@ build-backend = "hatchling.build"
name = "undate"
description = "library for working with uncertain, fuzzy, or partially unknown dates and date intervals"
readme = "README.md"
-license = { text = "Apache-2" }
+license = { text = "Apache-2.0" }
requires-python = ">= 3.10"
dynamic = ["version"]
dependencies = [
"lark[interegular]",
"numpy",
- "convertdate",
+ "convertdate>=2.4,<2.4.1", # changes syntax, deprecation warning
"strenum; python_version < '3.11'",
]
authors = [
@@ -42,7 +42,6 @@ classifiers = [
"Programming Language :: Python :: 3.13",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
- "License :: OSI Approved :: Apache Software License",
"Operating System :: OS Independent",
"Topic :: Software Development :: Libraries :: Python Modules",
"Topic :: Scientific/Engineering",
@@ -51,21 +50,20 @@ classifiers = [
]
-[project.optional-dependencies]
+[dependency-groups]
docs = ["sphinx>=7.0.0", "alabaster", "myst-parser", "myst-parser[linkify]"]
-test = ["pytest>=7.2", "pytest-ordering", "pytest-cov"]
+test = ["pytest>=9", "pytest-ordering", "pytest-cov"]
notebooks = ["jupyterlab", "pandas", "treon", "altair"]
-check = ["undate[docs]", "undate[notebooks]", "mypy", "ruff"]
+check = [ { include-group = "docs" }, {include-group = "notebooks"}, "mypy", "ruff"]
dev = [
"pre-commit>=2.20.0",
"twine",
"wheel",
"build",
- "undate[check]",
- "undate[docs]",
- "undate[test]",
+ { include-group = "test" },
+ { include-group = "check" },
+ { include-group = "docs" }
]
-all = ["undate", "undate[dev]"]
[project.urls]
Homepage = "https://github.com/dh-tech/undate-python"
@@ -87,9 +85,16 @@ dependencies = ["babel"]
[tool.hatch.envs.codegen.scripts]
generate = "python scripts/generate_gregorian_grammar.py"
-[tool.pytest.ini_options]
-pythonpath = "src/"
-testpaths = ["tests/"]
+[tool.pytest]
+minversion = "9"
+log_level = "INFO"
+strict = true
+addopts = ["-ra"]
+filterwarnings = ["error"]
+pythonpath = [ "src/" ]
+testpaths = [
+ "tests",
+]
markers = [
"last : run marked tests after all others",
"first : run marked tests before all others",
@@ -97,3 +102,23 @@ markers = [
[tool.mypy]
plugins = ["numpy.typing.mypy_plugin"]
+
+[tool.ruff.lint]
+# Include these rules in addition to ruff's defaults
+extend-select = [
+ "B", # flake8-bugbear
+ "C4", # flake8-comprehensions
+ "I", #isort
+ "NPY", # numpy-specific rules
+ "PERF", # perflint
+ "PTH", # flake8-use-pathlib
+ "RUF", # ruff-specific rules
+ "SIM", # flake8-simplify
+ "UP", # pyupgrade
+]
+# Can use to ignore specific rules within above selection
+ignore = []
+
+[tool.ruff.lint.per-file-ignores]
+# for test files, don't require docstrings or return type annotations
+"tests/**.py" = ["D", "ANN", "RUF"]
diff --git a/scripts/generate_gregorian_grammar.py b/scripts/generate_gregorian_grammar.py
index 822bbf7..5f821a4 100644
--- a/scripts/generate_gregorian_grammar.py
+++ b/scripts/generate_gregorian_grammar.py
@@ -10,8 +10,8 @@
"""
-from collections import defaultdict
import pathlib
+from collections import defaultdict
from babel.dates import get_month_names
diff --git a/src/undate/__init__.py b/src/undate/__init__.py
index 3f34de8..8e44222 100644
--- a/src/undate/__init__.py
+++ b/src/undate/__init__.py
@@ -1,14 +1,17 @@
__version__ = "0.7.0.dev0"
+# this sort order is important to avoid circular imports
+
+# ruff: noqa: I001
from undate.date import DatePrecision, UnDelta
-from undate.undate import Undate, Calendar
+from undate.undate import Calendar, Undate
from undate.interval import UndateInterval
__all__ = [
- "Undate",
- "UndateInterval",
"Calendar",
"DatePrecision",
"UnDelta",
+ "Undate",
+ "UndateInterval",
"__version__",
]
diff --git a/src/undate/converters/__init__.py b/src/undate/converters/__init__.py
index c13f2f1..1024ddb 100644
--- a/src/undate/converters/__init__.py
+++ b/src/undate/converters/__init__.py
@@ -24,6 +24,6 @@
"""
-from undate.converters.base import BaseDateConverter, GRAMMAR_FILE_PATH
+from undate.converters.base import GRAMMAR_FILE_PATH, BaseDateConverter
-__all__ = ["BaseDateConverter", "GRAMMAR_FILE_PATH"]
+__all__ = ["GRAMMAR_FILE_PATH", "BaseDateConverter"]
diff --git a/src/undate/converters/base.py b/src/undate/converters/base.py
index 93a63a7..3845311 100644
--- a/src/undate/converters/base.py
+++ b/src/undate/converters/base.py
@@ -47,7 +47,6 @@
import pathlib
import pkgutil
from functools import cache
-from typing import Dict, Type
from undate.date import Date
@@ -102,12 +101,12 @@ def import_converters(cls) -> int:
logger.debug("Loading converters under undate.converters")
import undate.converters
- # load packages under this path with curent package prefix
+ # load packages under this path with current package prefix
converter_path = undate.converters.__path__
converter_prefix = f"{undate.converters.__name__}."
import_count = 0
- for importer, modname, ispkg in pkgutil.iter_modules(
+ for _importer, modname, _ispkg in pkgutil.iter_modules(
converter_path, converter_prefix
):
# import everything except the current file
@@ -118,14 +117,14 @@ def import_converters(cls) -> int:
return import_count
@classmethod
- def available_converters(cls) -> Dict[str, Type["BaseDateConverter"]]:
+ def available_converters(cls) -> dict[str, type["BaseDateConverter"]]:
"""
Dictionary of available converters keyed on name.
"""
return {c.name: c for c in cls.subclasses()} # type: ignore
@classmethod
- def subclasses(cls) -> set[Type["BaseDateConverter"]]:
+ def subclasses(cls) -> set[type["BaseDateConverter"]]:
"""
Set of available converters classes. Includes descendant
subclasses, including calendar converters, but does not include
diff --git a/src/undate/converters/calendars/__init__.py b/src/undate/converters/calendars/__init__.py
index 5836b2f..f0aa6ff 100644
--- a/src/undate/converters/calendars/__init__.py
+++ b/src/undate/converters/calendars/__init__.py
@@ -6,6 +6,6 @@
__all__ = [
"GregorianDateConverter",
"HebrewDateConverter",
- "IslamicDateConverter",
+ "IslamicDateConverter",
"SeleucidDateConverter",
]
diff --git a/src/undate/converters/calendars/gregorian/converter.py b/src/undate/converters/calendars/gregorian/converter.py
index 9aa954d..31bf05d 100644
--- a/src/undate/converters/calendars/gregorian/converter.py
+++ b/src/undate/converters/calendars/gregorian/converter.py
@@ -1,11 +1,11 @@
-from calendar import monthrange, isleap
+from calendar import isleap, monthrange
from lark.exceptions import UnexpectedInput
-from undate.undate import Undate
from undate.converters.base import BaseCalendarConverter
from undate.converters.calendars.gregorian.parser import gregorian_parser
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
+from undate.undate import Undate
class GregorianDateConverter(BaseCalendarConverter):
diff --git a/src/undate/converters/calendars/gregorian/transformer.py b/src/undate/converters/calendars/gregorian/transformer.py
index a8e7048..5fe4df4 100644
--- a/src/undate/converters/calendars/gregorian/transformer.py
+++ b/src/undate/converters/calendars/gregorian/transformer.py
@@ -1,6 +1,6 @@
from lark import Transformer, Tree
-from undate import Undate, Calendar
+from undate import Calendar, Undate
class GregorianDateTransformer(Transformer):
diff --git a/src/undate/converters/calendars/hebrew/converter.py b/src/undate/converters/calendars/hebrew/converter.py
index dc8ad19..901c8a3 100644
--- a/src/undate/converters/calendars/hebrew/converter.py
+++ b/src/undate/converters/calendars/hebrew/converter.py
@@ -1,5 +1,3 @@
-from typing import Union
-
from convertdate import hebrew # type: ignore
from lark.exceptions import UnexpectedInput
@@ -93,7 +91,7 @@ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
"""
return hebrew.to_gregorian(year, month, day)
- def parse(self, value: str) -> Union[Undate, UndateInterval]:
+ def parse(self, value: str) -> Undate | UndateInterval:
"""
Parse a Hebrew date string and return an :class:`~undate.undate.Undate` or
:class:`~undate.undate.UndateInterval`.
@@ -115,4 +113,4 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
raise ValueError(f"Could not parse '{value}' as a Hebrew date") from err
# do we need to support conversion the other direction?
- # i.e., generate a Hebrew date from an abitrary undate or undate interval?
+ # i.e., generate a Hebrew date from an arbitrary undate or undate interval?
diff --git a/src/undate/converters/calendars/hebrew/transformer.py b/src/undate/converters/calendars/hebrew/transformer.py
index 7ad32ab..8526df2 100644
--- a/src/undate/converters/calendars/hebrew/transformer.py
+++ b/src/undate/converters/calendars/hebrew/transformer.py
@@ -1,6 +1,6 @@
from lark import Transformer, Tree
-from undate import Undate, Calendar
+from undate import Calendar, Undate
class HebrewUndate(Undate):
diff --git a/src/undate/converters/calendars/islamic/converter.py b/src/undate/converters/calendars/islamic/converter.py
index fae7f7f..f0962fc 100644
--- a/src/undate/converters/calendars/islamic/converter.py
+++ b/src/undate/converters/calendars/islamic/converter.py
@@ -1,5 +1,3 @@
-from typing import Union
-
from convertdate import islamic # type: ignore
from lark.exceptions import UnexpectedInput
@@ -79,7 +77,7 @@ def to_gregorian(self, year: int, month: int, day: int) -> tuple[int, int, int]:
# NOTE: this results in weird numbers for months when year gets sufficiently high
return islamic.to_gregorian(year, month, day)
- def parse(self, value: str) -> Union[Undate, UndateInterval]:
+ def parse(self, value: str) -> Undate | UndateInterval:
"""
Parse an Islamic/Hijri date string and return an :class:`~undate.undate.Undate` or
:class:`~undate.undate.UndateInterval`.
diff --git a/src/undate/converters/calendars/islamic/transformer.py b/src/undate/converters/calendars/islamic/transformer.py
index 19430b7..0f9e48c 100644
--- a/src/undate/converters/calendars/islamic/transformer.py
+++ b/src/undate/converters/calendars/islamic/transformer.py
@@ -1,6 +1,6 @@
from lark import Transformer, Tree
-from undate import Undate, Calendar
+from undate import Calendar, Undate
class IslamicUndate(Undate):
diff --git a/src/undate/converters/combined.py b/src/undate/converters/combined.py
index 3d07c4a..3cc9ae9 100644
--- a/src/undate/converters/combined.py
+++ b/src/undate/converters/combined.py
@@ -4,18 +4,16 @@
as EDTF in Gregorian calendar.
"""
-from typing import Union
-
from lark import Lark
from lark.exceptions import UnexpectedInput
from lark.visitors import Transformer, merge_transformers
from undate import Undate, UndateInterval
-from undate.converters import BaseDateConverter, GRAMMAR_FILE_PATH
-from undate.converters.edtf.transformer import EDTFTransformer
+from undate.converters import GRAMMAR_FILE_PATH, BaseDateConverter
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
from undate.converters.calendars.hebrew.transformer import HebrewDateTransformer
from undate.converters.calendars.islamic.transformer import IslamicDateTransformer
+from undate.converters.edtf.transformer import EDTFTransformer
from undate.converters.holidays import HolidayTransformer
@@ -68,7 +66,7 @@ class OmnibusDateConverter(BaseDateConverter):
def __init__(self):
self.transformer = combined_transformer
- def parse(self, value: str) -> Union[Undate, UndateInterval]:
+ def parse(self, value: str) -> Undate | UndateInterval:
"""
Parse a string in a supported format and return an :class:`~undate.undate.Undate`
or :class:`~undate.undate.UndateInterval`.
@@ -81,11 +79,11 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
parsetree = parser.parse(value)
# transform returns a list; we want the first item in the list
return self.transformer.transform(parsetree)[0]
- except UnexpectedInput:
+ except UnexpectedInput as err:
raise ValueError(
- "Parsing failed: '%s' is not in a recognized date format" % value
- )
+ f"Parsing failed: '{value}' is not in a recognized date format"
+ ) from err
- def to_string(self, undate: Union[Undate, UndateInterval]) -> str:
+ def to_string(self, undate: Undate | UndateInterval) -> str:
"Not supported by this converter. Will raise :class:`ValueError`"
raise ValueError("Omnibus converter does not support serialization")
diff --git a/src/undate/converters/edtf/converter.py b/src/undate/converters/edtf/converter.py
index e5eddac..d9804d6 100644
--- a/src/undate/converters/edtf/converter.py
+++ b/src/undate/converters/edtf/converter.py
@@ -1,5 +1,3 @@
-from typing import Optional, Union
-
from lark.exceptions import UnexpectedInput
from undate import Undate, UndateInterval
@@ -8,7 +6,6 @@
from undate.converters.edtf.transformer import EDTFTransformer
from undate.date import DatePrecision
-
#: character for unspecified digits
EDTF_UNSPECIFIED_DIGIT: str = "X"
@@ -27,7 +24,7 @@ class EDTFDateConverter(BaseDateConverter):
def __init__(self):
self.transformer = EDTFTransformer()
- def parse(self, value: str) -> Union[Undate, UndateInterval]:
+ def parse(self, value: str) -> Undate | UndateInterval:
"""
Parse a string in a supported EDTF date or date interval format and
return an :class:`~undate.undate.Undate` or
@@ -46,13 +43,13 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
) from err
def _convert_missing_digits(
- self, value: Optional[str], old_missing_digit: str
- ) -> Optional[str]:
+ self, value: str | None, old_missing_digit: str
+ ) -> str | None:
if value:
return value.replace(old_missing_digit, EDTF_UNSPECIFIED_DIGIT)
return None
- def to_string(self, undate: Union[Undate, UndateInterval]) -> str:
+ def to_string(self, undate: Undate | UndateInterval) -> str:
"""
Convert an :class:`~undate.undate.Undate` or
:class:`~undate.undate.UndateInterval` to EDTF format.
diff --git a/src/undate/converters/edtf/parser.py b/src/undate/converters/edtf/parser.py
index bc8f0ef..4e1bda0 100644
--- a/src/undate/converters/edtf/parser.py
+++ b/src/undate/converters/edtf/parser.py
@@ -4,5 +4,5 @@
grammar_path = GRAMMAR_FILE_PATH / "edtf.lark"
-with open(grammar_path) as grammar:
+with grammar_path.open() as grammar:
edtf_parser = Lark(grammar.read(), start="edtf")
diff --git a/src/undate/converters/holidays.py b/src/undate/converters/holidays.py
index 3d0df07..72c890a 100644
--- a/src/undate/converters/holidays.py
+++ b/src/undate/converters/holidays.py
@@ -4,12 +4,12 @@
import datetime
-from lark import Lark, Transformer, Tree, Token
+from convertdate import holidays # type: ignore[import-untyped]
+from lark import Lark, Token, Transformer, Tree
from lark.exceptions import UnexpectedInput
-from convertdate import holidays # type: ignore[import-untyped]
-from undate import Undate, Calendar
-from undate.converters.base import BaseDateConverter, GRAMMAR_FILE_PATH
+from undate import Calendar, Undate
+from undate.converters.base import GRAMMAR_FILE_PATH, BaseDateConverter
# To add a new holiday:
# 1. Add a name and pattern to holidays.lark grammar file
@@ -66,8 +66,8 @@ def fixed_date(self, items):
holiday_name = item.type.split("__")[-1]
try:
month, day = FIXED_HOLIDAYS[holiday_name]
- except KeyError:
- raise ValueError(f"Unknown fixed holiday {holiday_name}")
+ except KeyError as err:
+ raise ValueError(f"Unknown fixed holiday {holiday_name}") from err
return Tree("fixed_date", [Token("month", month), Token("day", day)])
def holiday_date(self, items):
@@ -115,8 +115,8 @@ def _get_date_parts(self, items) -> dict[str, int | str]:
if movable_feast is not None:
try:
year = parts["year"]
- except KeyError:
- raise ValueError("Year is required for movable feasts")
+ except KeyError as err:
+ raise ValueError("Year is required for movable feasts") from err
offset = MOVABLE_FEASTS[movable_feast]
holiday_date = datetime.date(*holidays.easter(year)) + datetime.timedelta(
diff --git a/src/undate/converters/iso8601.py b/src/undate/converters/iso8601.py
index 4f05b69..419d8f6 100644
--- a/src/undate/converters/iso8601.py
+++ b/src/undate/converters/iso8601.py
@@ -1,4 +1,4 @@
-from typing import Dict, List, Union
+from typing import ClassVar
from undate import Undate, UndateInterval
from undate.converters.base import BaseDateConverter
@@ -13,13 +13,13 @@ class ISO8601DateFormat(BaseDateConverter):
# do not change; Undate relies on this string
#: datetime strftime format for known part of date
- iso_format: Dict[str, str] = {
+ iso_format: ClassVar[dict[str, str]] = {
"year": "%Y",
"month": "%m",
"day": "%d",
}
- def parse(self, value: str) -> Union[Undate, UndateInterval]:
+ def parse(self, value: str) -> Undate | UndateInterval:
"""
Parse an ISO88601 string and return an :class:`~undate.undate.Undate` or
:class:`~undate.undate.UndateInterval`. Currently supports
@@ -29,7 +29,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
# TODO: what happens if someone gives us a full isoformat date with time?
# (ignore, error?)
# TODO: what about invalid format?
- parts: List[str] = value.split("/") # split in case we have a range
+ parts: list[str] = value.split("/") # split in case we have a range
if len(parts) == 1:
return self._parse_single_date(parts[0])
elif len(parts) == 2:
@@ -43,7 +43,7 @@ def parse(self, value: str) -> Union[Undate, UndateInterval]:
def _parse_single_date(self, value: str) -> Undate:
# split single iso date into parts; convert to int or None
# special case: missing year
- date_parts: List[Union[int, None]] = []
+ date_parts: list[int | None] = []
if value.startswith("--"):
date_parts.append(None) # year unknown
value = value[2:]
@@ -53,7 +53,7 @@ def _parse_single_date(self, value: str) -> Undate:
# Argument of type "int | None" cannot be assigned to parameter "formatter" of type "BaseDateFormat | None" in function "__init__"
return Undate(*date_parts) # type: ignore
- def to_string(self, undate: Union[Undate, UndateInterval]) -> str:
+ def to_string(self, undate: Undate | UndateInterval) -> str:
"""
Convert an :class:`~undate.undate.Undate` or
:class:`~undate.undate.UndateInterval` to ISO8601 string format.
@@ -70,13 +70,13 @@ def to_string(self, undate: Union[Undate, UndateInterval]) -> str:
def _undate_to_string(self, undate: Undate) -> str:
# serialize to iso format for simplicity, for now
- date_parts: List[Union[str, None]] = []
+ date_parts: list[str | None] = []
# for each part of the date that is known, generate the string format
# then combine
# TODO: should error if we have year and day but no month
# TODO: may want to refactor and take advantage of the year/month/day properties
# added for use in EDTF formatter code
- for date_portion, iso_format in self.iso_format.items():
+ for date_portion in self.iso_format:
# is known means fully known, means guaranteed integer
if undate.is_known(date_portion):
# NOTE: datetime strftime for %Y for 3-digit year
@@ -84,26 +84,26 @@ def _undate_to_string(self, undate: Undate) -> str:
# and not others; force year to always be 4 digits
if date_portion == "year" and undate.year:
try:
- date_parts.append("%04d" % int(undate.year))
+ date_parts.append(f"{int(undate.year):04}")
except ValueError:
# shouldn't happen because of is_known
date_parts.append(undate.year)
elif date_portion == "month" and undate.month:
try:
- date_parts.append("%02d" % int(undate.month))
+ date_parts.append(f"{int(undate.month):02}")
except ValueError:
# shouldn't happen because of is_known
date_parts.append(undate.month)
elif date_portion == "day" and undate.day:
try:
- date_parts.append("%02d" % int(undate.day))
+ date_parts.append(f"{int(undate.day):02}")
except ValueError:
# shouldn't happen because of is_known
date_parts.append(undate.day)
elif date_portion == "year":
# if year is not known, add '-' for year portion,
- # to genereate --MM-DD unknown year format
+ # to generate --MM-DD unknown year format
date_parts.append("-")
# TODO: fix type error: "list[str | None]" is incompatible with "Iterable[str]"
return "-".join(date_parts) # type: ignore
diff --git a/src/undate/date.py b/src/undate/date.py
index ee87d30..9ef3da3 100644
--- a/src/undate/date.py
+++ b/src/undate/date.py
@@ -1,10 +1,9 @@
-from enum import IntEnum
-from dataclasses import dataclass, replace
import operator
+from collections.abc import Iterable
+from dataclasses import dataclass, replace
+from enum import IntEnum
# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None
-from typing import Optional, Union, Iterable
-
import numpy as np
@@ -12,7 +11,7 @@ class Timedelta(np.ndarray):
"""Convenience class to make :class:`numpy.timedelta64` act
more like the built-in python :class:`datetime.timedelta`."""
- def __new__(cls, deltadays: Union[np.timedelta64, int]):
+ def __new__(cls, deltadays: np.timedelta64 | int):
if isinstance(deltadays, int):
deltadays = np.timedelta64(deltadays, "D")
data = np.asarray(deltadays, dtype="timedelta64")
@@ -186,9 +185,9 @@ class Date(np.ndarray):
def __new__(
cls,
- year: Union[int, np.datetime64],
- month: Optional[int] = None,
- day: Optional[int] = None,
+ year: int | np.datetime64,
+ month: int | None = None,
+ day: int | None = None,
):
if isinstance(year, np.datetime64):
_data = year
@@ -231,21 +230,21 @@ def year(self) -> int:
return int(str(self.astype("datetime64[Y]")))
@property
- def month(self) -> Optional[int]:
+ def month(self) -> int | None:
# if date unit is year, don't return a month (only M/D)
if self.dtype != "datetime64[Y]":
return int(str(self.astype("datetime64[M]")).split("-")[-1])
return None
@property
- def day(self) -> Optional[int]:
+ def day(self) -> int | None:
# only return a day if date unit is in days
if self.dtype == "datetime64[D]":
return int(str(self.astype("datetime64[D]")).split("-")[-1])
return None
@property
- def weekday(self) -> Optional[int]:
+ def weekday(self) -> int | None:
"""Equivalent to :meth:`datetime.date.weekday`; returns day of week as an
integer where Monday is 0 and Sunday is 6. Only supported for dates
with date unit in days.
@@ -297,7 +296,7 @@ class DatePrecision(IntEnum):
of the date is known."""
# NOTE: values MUST be ordered based on the relative size or
- # precison of the time unit. That is, the smaller the unit, the more precise
+ # precision of the time unit. That is, the smaller the unit, the more precise
# it is: a day is more precise than a month, a month is more precise than a year,
# (DatePrecision.year < DatePrecision.month)
@@ -317,4 +316,4 @@ def __str__(self):
return f"{self.name}"
# NOTE: consider harmonizing / using numpy date units:
- # years (‘Y’), months (‘M’), weeks (‘W’), and days (‘D’)
+ # years (Y), months (M), weeks (W), and days (D)
diff --git a/src/undate/interval.py b/src/undate/interval.py
index a7fbe55..774cc89 100644
--- a/src/undate/interval.py
+++ b/src/undate/interval.py
@@ -1,10 +1,9 @@
# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None
-from typing import Optional, Union
-
+from typing import Optional
from undate import Undate
-from undate.date import ONE_DAY, ONE_YEAR, Timedelta
from undate.converters.base import BaseDateConverter
+from undate.date import ONE_DAY, ONE_YEAR, Timedelta
class UndateInterval:
@@ -19,18 +18,18 @@ class UndateInterval:
"""
# date range between two undates
- earliest: Union[Undate, None]
- latest: Union[Undate, None]
- label: Union[str, None]
+ earliest: Undate | None
+ latest: Undate | None
+ label: str | None
# TODO: think about adding an optional precision / length /size field
# using DatePrecision for intervals of any standard duration (decade, century)
def __init__(
self,
- earliest: Optional[Undate] = None,
- latest: Optional[Undate] = None,
- label: Optional[str] = None,
+ earliest: Undate | None = None,
+ latest: Undate | None = None,
+ label: str | None = None,
):
# takes two undate objects; allows conversion from supported types
if earliest:
@@ -58,7 +57,7 @@ def __init__(
def __str__(self) -> str:
# using EDTF syntax for open ranges
- return "%s/%s" % (self.earliest or "..", self.latest or "")
+ return f"{self.earliest or '..'}/{self.latest or ''}"
def format(self, format) -> str:
"""format this undate interval as a string using the specified format;
@@ -156,12 +155,10 @@ def __contains__(self, other: object) -> bool:
# bounds of this interval
return (
self.earliest is None
- or other_earliest is not None
- and other_earliest >= self.earliest
+ or (other_earliest is not None and other_earliest >= self.earliest)
) and (
self.latest is None
- or other_latest is not None
- and other_latest <= self.latest
+ or (other_latest is not None and other_latest <= self.latest)
)
def intersection(self, other: "UndateInterval") -> Optional["UndateInterval"]:
diff --git a/src/undate/undate.py b/src/undate/undate.py
index 5ca407f..7128084 100644
--- a/src/undate/undate.py
+++ b/src/undate/undate.py
@@ -1,9 +1,8 @@
from __future__ import annotations
import datetime
-from enum import auto
-
import re
+from enum import auto
from typing import TYPE_CHECKING
if TYPE_CHECKING:
@@ -17,7 +16,6 @@
from strenum import StrEnum # type: ignore
# Pre 3.10 requires Union for multiple types, e.g. Union[int, None] instead of int | None
-from typing import Dict, Optional, Union
from undate.converters.base import BaseCalendarConverter, BaseDateConverter
from undate.date import ONE_DAY, Date, DatePrecision, Timedelta, UnDelta
@@ -60,7 +58,7 @@ class Undate:
latest: Date
#: A string to label a specific undate, e.g. "German Unity Date 2022" for Oct. 3, 2022.
#: Labels are not taken into account when comparing undate objects.
- label: Union[str, None] = None
+ label: str | None = None
converter: BaseDateConverter
#: precision of the date (day, month, year, etc.)
precision: DatePrecision
@@ -77,20 +75,20 @@ class Undate:
def __init__(
self,
- year: Optional[Union[int, str]] = None,
- month: Optional[Union[int, str]] = None,
- day: Optional[Union[int, str]] = None,
- converter: Optional[BaseDateConverter] = None,
- label: Optional[str] = None,
- calendar: Optional[Union[str, Calendar]] = None,
+ year: int | str | None = None,
+ month: int | str | None = None,
+ day: int | str | None = None,
+ converter: BaseDateConverter | None = None,
+ label: str | None = None,
+ calendar: str | Calendar | None = None,
):
# everything is optional but something is required
- if all([val is None for val in [year, month, day]]):
+ if all(val is None for val in [year, month, day]):
raise ValueError("At least one of year, month, or day must be specified")
# keep track of initial values and which values are known
# TODO: add validation: if str, must be expected length
- self.initial_values: Dict[str, Optional[Union[int, str]]] = {
+ self.initial_values: dict[str, int | str | None] = {
"year": year,
"month": month,
"day": day,
@@ -168,7 +166,7 @@ def calculate_earliest_latest(self, year, month, day):
day = None
# if day is numeric, use as is
- if isinstance(day, int) or isinstance(day, str) and day.isnumeric():
+ if isinstance(day, int) or (isinstance(day, str) and day.isnumeric()):
day = int(day)
# update initial value - fully known day
self.initial_values["day"] = day
@@ -177,7 +175,7 @@ def calculate_earliest_latest(self, year, month, day):
# if we have no day or partial day, calculate min / max
min_day = 1 # is min day ever anything other than 1 ?
rel_year = year if year and isinstance(year, int) else max_year
- # use month if it is an integer; otherwise use previusly determined
+ # use month if it is an integer; otherwise use previously determined
# max month (which may not be 12 depending if partially unknown)
rel_month = month if month and isinstance(month, int) else latest_month
@@ -201,7 +199,7 @@ def calculate_earliest_latest(self, year, month, day):
*self.calendar_converter.to_gregorian(max_year, latest_month, max_day)
)
- def set_calendar(self, calendar: Union[str, Calendar]):
+ def set_calendar(self, calendar: str | Calendar):
"""Find calendar by name if passed as string and set on the object.
Only intended for use at initialization time; use :meth:`as_calendar`
to change calendar."""
@@ -215,7 +213,7 @@ def set_calendar(self, calendar: Union[str, Calendar]):
raise ValueError(f"Calendar `{calendar}` is not supported") from err
self.calendar = calendar
- def as_calendar(self, calendar: Union[str, Calendar]):
+ def as_calendar(self, calendar: str | Calendar):
"""Return a new :class:`Undate` object with the same year, month, day, and labels
used to initialize the current object, but with a different calendar. Note that this
does NOT do calendar conversion, but reinterprets current numeric year, month, day values
@@ -261,7 +259,7 @@ def __repr__(self) -> str:
return f"undate.Undate({init_str})"
@classmethod
- def parse(cls, date_string, format) -> Union["Undate", UndateInterval]:
+ def parse(cls, date_string, format) -> Undate | UndateInterval:
"""parse a string to an undate or undate interval using the specified format;
for now, only supports named converters"""
converter_cls = BaseDateConverter.available_converters().get(format, None)
@@ -282,7 +280,7 @@ def format(self, format) -> str:
raise ValueError(f"Unsupported format '{format}'")
@classmethod
- def _comparison_type(cls, other: object) -> "Undate":
+ def _comparison_type(cls, other: object) -> Undate:
"""Common logic for type handling in comparison methods.
Converts to Undate object if possible, otherwise raises
NotImplementedError exception. Uses :meth:`to_undate` for conversion.
@@ -332,8 +330,8 @@ def __eq__(self, other: object) -> bool:
if looks_equal and (
# if any part of either date that is known is _partially_ known,
# then these dates are not equal
- any([self.is_partially_known(p) for p in self.initial_values.keys()])
- or any([other.is_partially_known(p) for p in other.initial_values.keys()])
+ any(self.is_partially_known(p) for p in self.initial_values)
+ or any(other.is_partially_known(p) for p in other.initial_values)
):
return False
@@ -389,14 +387,14 @@ def __gt__(self, other: object) -> bool:
# if either date has a completely unknown year, then we can't compare
# NOTE: this means that gt and lt will both be false when comparing
# with a date with an unknown year...
- if self.unknown_year or isinstance(other, Undate) and other.unknown_year:
+ if self.unknown_year or (isinstance(other, Undate) and other.unknown_year):
return False
return not (self < other or self == other)
def __le__(self, other: object) -> bool:
# if either date has a completely unknown year, then we can't compare
- if self.unknown_year or isinstance(other, Undate) and other.unknown_year:
+ if self.unknown_year or (isinstance(other, Undate) and other.unknown_year):
return False
return self == other or self < other
@@ -430,7 +428,7 @@ def __contains__(self, other: object) -> bool:
)
@classmethod
- def to_undate(cls, other: object) -> "Undate":
+ def to_undate(cls, other: object) -> Undate:
"""Convert arbitrary object to Undate, if possible. Raises TypeError
if conversion is not possible.
@@ -481,7 +479,7 @@ def is_partially_known(self, part: str) -> bool:
# and self.initial_values[part].replace(self.MISSING_DIGIT, "") != ""
@property
- def year(self) -> Optional[str]:
+ def year(self) -> str | None:
"year as string (minimum 4 characters), if year is known"
year = self._get_date_part("year")
if year:
@@ -492,7 +490,7 @@ def year(self) -> Optional[str]:
return None
@property
- def month(self) -> Optional[str]:
+ def month(self) -> str | None:
"month as 2-character string, or None if unknown/unset"
# TODO: do we allow None for unknown month with day-level granularity?
# TODO: need to distinguish between unknown (XX) and unset/not part of the date due to granularity
@@ -505,7 +503,7 @@ def month(self) -> Optional[str]:
return None
@property
- def day(self) -> Optional[str]:
+ def day(self) -> str | None:
"day as 2-character string or None if unset"
day = self._get_date_part("day")
if day:
@@ -516,7 +514,7 @@ def day(self) -> Optional[str]:
return self.MISSING_DIGIT * 2
return None
- def _get_date_part(self, part: str) -> Optional[str]:
+ def _get_date_part(self, part: str) -> str | None:
value = self.initial_values.get(part)
return str(value) if value else None
@@ -589,7 +587,7 @@ def duration(self) -> Timedelta | UnDelta:
# if year is known and no values are partially known,
# we can calculate a time delta based on earliest + latest
if self.known_year and not any(
- [self.is_partially_known(part) for part in ["year", "month", "day"]]
+ self.is_partially_known(part) for part in ["year", "month", "day"]
):
# subtract earliest from latest and add a day to include start day in the count
return self.latest - self.earliest + ONE_DAY
@@ -655,7 +653,7 @@ def _missing_digit_minmax(
# assuming two digit only (i.e., month or day)
possible_values = [f"{n:02}" for n in range(min_val, max_val + 1)]
# ensure input value has two digits
- value = "%02s" % value
+ value = f"{value:>2}"
# generate regex where missing digit matches anything
val_pattern = re.compile(value.replace(self.MISSING_DIGIT, "."))
# identify all possible matches, then get min and max
diff --git a/tests/test_converters/edtf/test_edtf_parser.py b/tests/test_converters/edtf/test_edtf_parser.py
index 73d4e02..7735d2f 100644
--- a/tests/test_converters/edtf/test_edtf_parser.py
+++ b/tests/test_converters/edtf/test_edtf_parser.py
@@ -1,4 +1,6 @@
import pytest
+from lark.exceptions import UnexpectedCharacters
+
from undate.converters.edtf.parser import edtf_parser
# for now, just test that valid dates can be parsed
@@ -51,5 +53,5 @@ def test_should_parse(date_string):
@pytest.mark.parametrize("date_string", error_cases)
def test_should_error(date_string):
- with pytest.raises(Exception):
+ with pytest.raises(UnexpectedCharacters):
edtf_parser.parse(date_string)
diff --git a/tests/test_converters/test_base.py b/tests/test_converters/test_base.py
index 6265c15..4d67eca 100644
--- a/tests/test_converters/test_base.py
+++ b/tests/test_converters/test_base.py
@@ -1,7 +1,8 @@
import logging
import pytest
-from undate.converters.base import BaseDateConverter, BaseCalendarConverter
+
+from undate.converters.base import BaseCalendarConverter, BaseDateConverter
from undate.converters.calendars import (
GregorianDateConverter,
HebrewDateConverter,
diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py
index 9839b34..0b26727 100644
--- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py
+++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_converter.py
@@ -1,9 +1,8 @@
import pytest
-
-from undate.date import DatePrecision
-from undate.undate import Undate, Calendar
from undate.converters.calendars import GregorianDateConverter
+from undate.date import DatePrecision
+from undate.undate import Calendar, Undate
class TestGregorianDateConverter:
diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py
index 3938bad..0acd657 100644
--- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py
+++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_parser.py
@@ -3,7 +3,6 @@
from undate.converters.calendars.gregorian.parser import gregorian_parser
-
# test that valid dates can be parsed to confirm parser is working correctly
testcases = [
diff --git a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py
index 114a713..a6107ce 100644
--- a/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py
+++ b/tests/test_converters/test_calendars/test_gregorian/test_gregorian_transformer.py
@@ -1,9 +1,9 @@
import pytest
+
from undate.converters.calendars.gregorian.parser import gregorian_parser
from undate.converters.calendars.gregorian.transformer import GregorianDateTransformer
-from undate.undate import Undate, Calendar
from undate.date import DatePrecision
-
+from undate.undate import Calendar, Undate
testcases = [
("2012", Undate(2012), DatePrecision.YEAR),
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py
index 6fe8c96..db5df6c 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_converter.py
@@ -2,8 +2,8 @@
from undate.converters.calendars import HebrewDateConverter
from undate.converters.calendars.hebrew.transformer import HebrewUndate
+from undate.date import Date, DatePrecision
from undate.undate import Calendar, Undate
-from undate.date import DatePrecision, Date
class TestHebrewDateConverter:
@@ -136,7 +136,7 @@ def test_compare_across_calendars(self):
assert HebrewUndate(4816, 4, 26) > Undate(1055, 5)
# 26 Tammuz 4816: Tammuz = month 4 (17 July, 1056)
- # so it falls within or is c ontained by July 1056
+ # so it falls within or is contained by July 1056
assert HebrewUndate(4816, 4, 26) in Undate(1056, 7)
assert HebrewUndate(4816, 4, 26) not in Undate(1054)
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py
index 69b929e..5810e70 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_parser.py
@@ -3,7 +3,6 @@
from undate.converters.calendars.hebrew.parser import hebrew_parser
-
# for now, just test that valid dates can be parsed
testcases = [
diff --git a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
index 7dcca83..ec0d1dc 100644
--- a/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
+++ b/tests/test_converters/test_calendars/test_hebrew/test_hebrew_transformer.py
@@ -1,11 +1,12 @@
import pytest
+
from undate.converters.calendars.hebrew.parser import hebrew_parser
from undate.converters.calendars.hebrew.transformer import (
HebrewDateTransformer,
HebrewUndate,
)
-from undate.undate import Undate, Calendar
from undate.date import DatePrecision
+from undate.undate import Calendar, Undate
def test_hebrew_undate():
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py
index cfcace2..4f88c0c 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_converter.py
@@ -2,8 +2,8 @@
from undate.converters.calendars import IslamicDateConverter
from undate.converters.calendars.islamic.transformer import IslamicUndate
+from undate.date import Date, DatePrecision
from undate.undate import Calendar, Undate
-from undate.date import DatePrecision, Date
class TestIslamicDateConverter:
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py
index de4901e..c8ef39f 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_parser.py
@@ -3,7 +3,6 @@
from undate.converters.calendars.islamic.parser import islamic_parser
-
# for now, just test that valid dates can be parsed
testcases = [
diff --git a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
index 04ff53b..15e8cb5 100644
--- a/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
+++ b/tests/test_converters/test_calendars/test_islamic/test_islamic_transformer.py
@@ -1,11 +1,12 @@
import pytest
+
from undate.converters.calendars.islamic.parser import islamic_parser
from undate.converters.calendars.islamic.transformer import (
IslamicDateTransformer,
IslamicUndate,
)
-from undate.undate import Undate, Calendar
from undate.date import DatePrecision
+from undate.undate import Calendar, Undate
def test_islamic_undate():
@@ -28,7 +29,7 @@ def test_islamic_undate():
# examples from ISMI data (reformatted to day month year)
# Rabi 1 = month 3
("14 Rabīʿ I 901", IslamicUndate(901, 3, 14), DatePrecision.DAY),
- ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH),
+ ("Rabīʿ I 490", IslamicUndate(490, 3), DatePrecision.MONTH),
("884", IslamicUndate(884), DatePrecision.YEAR),
# Gregorian: UndateInterval(Undate(1479, 4, 3), Undate(1480, 3, 21)),
# add when we support parsing ranges:
diff --git a/tests/test_converters/test_combined_parser.py b/tests/test_converters/test_combined_parser.py
index 02a79e6..c229880 100644
--- a/tests/test_converters/test_combined_parser.py
+++ b/tests/test_converters/test_combined_parser.py
@@ -1,8 +1,7 @@
import pytest
-from undate.converters.combined import parser, combined_transformer
-
from undate import Undate, UndateInterval
+from undate.converters.combined import combined_transformer, parser
# test that valid dates can be parsed
@@ -29,7 +28,7 @@
("Epiphany 1921", Undate(1921, 1, 6)),
("Pentecost 2016", Undate(2016, 5, 15)),
("Ash Wednesday 2000", Undate(2000, 3, 8)),
- ("Whit Monday 2023", Undate(2023, 5, 29)),
+ ("Whit Monday 2023", Undate(2023, 5, 29)), # codespell:ignore whit
]
diff --git a/tests/test_converters/test_edtf.py b/tests/test_converters/test_edtf.py
index 3262e46..e54823a 100644
--- a/tests/test_converters/test_edtf.py
+++ b/tests/test_converters/test_edtf.py
@@ -1,6 +1,7 @@
import pytest
-from undate.converters.edtf import EDTFDateConverter
+
from undate import Undate, UndateInterval
+from undate.converters.edtf import EDTFDateConverter
class TestEDTFDateConverter:
diff --git a/tests/test_converters/test_holidays.py b/tests/test_converters/test_holidays.py
index cf30bbe..1aa21a3 100644
--- a/tests/test_converters/test_holidays.py
+++ b/tests/test_converters/test_holidays.py
@@ -1,10 +1,9 @@
import pytest
-
from lark import Token, Tree
-from undate import Undate, Calendar
-from undate.date import Weekday
+from undate import Calendar, Undate
from undate.converters.holidays import HolidayDateConverter, HolidayTransformer
+from undate.date import Weekday
class TestHolidayConverter:
@@ -36,7 +35,11 @@ def test_fixed_holidays(self, input_string, expected):
("Ascension 1988", Undate(1988, 5, 12), Weekday.THURSDAY),
("Ascension Day 1999", Undate(1999, 5, 13), Weekday.THURSDAY),
("Pentecost 2016", Undate(2016, 5, 15), Weekday.SUNDAY),
- ("whit monday 2005", Undate(2005, 5, 16), Weekday.MONDAY),
+ (
+ "whit monday 2005", # codespell:ignore whit
+ Undate(2005, 5, 16),
+ Weekday.MONDAY,
+ ),
("whitsun monday 2023", Undate(2023, 5, 29), Weekday.MONDAY),
("trinity 1978", Undate(1978, 5, 21), Weekday.SUNDAY),
("Trinity Sunday 1967", Undate(1967, 5, 21), Weekday.SUNDAY),
diff --git a/tests/test_date.py b/tests/test_date.py
index fc6cc72..e0eb5d7 100644
--- a/tests/test_date.py
+++ b/tests/test_date.py
@@ -5,8 +5,8 @@
from undate.date import (
ONE_DAY,
- ONE_YEAR,
ONE_MONTH_MAX,
+ ONE_YEAR,
Date,
DatePrecision,
Timedelta,
@@ -159,7 +159,7 @@ def test_gt(self):
assert not ten_twelve > UnInt(13, 23)
# unsupported type
with pytest.raises(TypeError):
- ten_twelve > "three"
+ assert ten_twelve > "three"
def test_lt(self):
ten_twelve = UnInt(10, 12)
@@ -173,7 +173,7 @@ def test_lt(self):
assert not ten_twelve < UnInt(2, 4)
# unsupported type
with pytest.raises(TypeError):
- ten_twelve < "three"
+ assert ten_twelve < "three"
def test_iterable(self):
anymonth_days = UnInt(lower=28, upper=31)
diff --git a/tests/test_interval.py b/tests/test_interval.py
index dbf28b3..828833c 100644
--- a/tests/test_interval.py
+++ b/tests/test_interval.py
@@ -63,7 +63,7 @@ def test_repr(self):
closed_interval = UndateInterval(Undate(2022), Undate(2023))
assert (
repr(closed_interval)
- == f"undate.UndateInterval(earliest={repr(closed_interval.earliest)}, latest={repr(closed_interval.latest)})"
+ == f"undate.UndateInterval(earliest={closed_interval.earliest!r}, latest={closed_interval.latest!r})"
)
# should be able to evaluate repr string to get an equivalent object
assert eval(repr(closed_interval)) == closed_interval
@@ -71,7 +71,7 @@ def test_repr(self):
fancy_epoch = UndateInterval(Undate(2022), Undate(2023), label="Fancy Epoch")
assert (
repr(fancy_epoch)
- == f"undate.UndateInterval(earliest={repr(fancy_epoch.earliest)}, latest={repr(fancy_epoch.latest)}, label='Fancy Epoch')"
+ == f"undate.UndateInterval(earliest={fancy_epoch.earliest!r}, latest={fancy_epoch.latest!r}, label='Fancy Epoch')"
)
assert eval(repr(fancy_epoch)) == fancy_epoch
@@ -80,7 +80,7 @@ def test_repr(self):
)
assert (
repr(open_interval)
- == f"undate.UndateInterval(earliest={repr(open_interval.earliest)})"
+ == f"undate.UndateInterval(earliest={open_interval.earliest!r})"
)
assert eval(repr(open_interval)) == open_interval
diff --git a/tests/test_undate.py b/tests/test_undate.py
index 2aa855d..3d65667 100644
--- a/tests/test_undate.py
+++ b/tests/test_undate.py
@@ -4,10 +4,10 @@
import pytest
-from undate import Undate, UndateInterval, Calendar
-from undate.undate import StrEnum # import whichever version is used there
+from undate import Calendar, Undate, UndateInterval
from undate.converters.base import BaseCalendarConverter, BaseDateConverter
from undate.date import Date, DatePrecision, Timedelta, UnDelta, UnInt
+from undate.undate import StrEnum # import whichever version is used there
class TestUndate:
@@ -259,7 +259,7 @@ def test_eq(self):
assert Undate(2022, 10) == Undate(2022, 10)
assert Undate(2022, 10, 1) == Undate(2022, 10, 1)
# dates without a known year cannot known to be equal
- assert not Undate(month=2, day=7) == Undate(month=2, day=7)
+ assert Undate(month=2, day=7) != Undate(month=2, day=7)
# something we can't convert for comparison should return NotImplemented
assert Undate(2022).__eq__("not a date") == NotImplemented