From c43ec46327a7c09c2baac30335b5895ad5a307f5 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Wed, 4 Oct 2017 01:24:45 +0900 Subject: [PATCH 1/7] bpo-31672: string.Template should use re.A flag As documented, identifier should be ASCII. Since we forgot the re.A flag, it matched some non-ASCII characters. For backward compatibility, we need to remove the re.A flag after the pattern is compiled. --- Lib/string.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Lib/string.py b/Lib/string.py index b46e60c38f4928..d7954b99fa3353 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -81,7 +81,7 @@ class Template(metaclass=_TemplateMetaclass): delimiter = '$' idpattern = r'[_a-z][_a-z0-9]*' braceidpattern = None - flags = _re.IGNORECASE + flags = _re.IGNORECASE | _re.ASCII def __init__(self, template): self.template = template @@ -157,6 +157,10 @@ def convert(mo): return self.pattern.sub(convert, self.template) +# We use re.I | re.A when compiling Template.idpattern, but restore old flag +# for backward compatibility. +Template.flags = _re.IGNORECASE + ######################################################################## # the Formatter class From 3862b335135a85d3cec3aa4bb31ca82e1be5eccb Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Wed, 4 Oct 2017 11:59:57 +0900 Subject: [PATCH 2/7] Update comment --- Lib/string.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/Lib/string.py b/Lib/string.py index d7954b99fa3353..335018dcbe49a1 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -81,6 +81,10 @@ class Template(metaclass=_TemplateMetaclass): delimiter = '$' idpattern = r'[_a-z][_a-z0-9]*' braceidpattern = None + + # We use re.I | re.A while compiling Template.idpattern in the metaclass + # above, but since flags is part of the public API, we restore its original + # documented value after class creation for backward compatibility. 
flags = _re.IGNORECASE | _re.ASCII def __init__(self, template): @@ -157,8 +161,7 @@ def convert(mo): return self.pattern.sub(convert, self.template) -# We use re.I | re.A when compiling Template.idpattern, but restore old flag -# for backward compatibility. +# Restore old, documented flag. See Template.flags for detail. Template.flags = _re.IGNORECASE From 586ef34cda72b35eb35cb2b39e3b8215b4b22615 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Wed, 11 Oct 2017 12:57:09 +0900 Subject: [PATCH 3/7] Use -i local flag, as suggested by Serhiy --- Lib/string.py | 15 ++++++--------- Lib/test/test_string.py | 4 ++++ 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Lib/string.py b/Lib/string.py index 335018dcbe49a1..a3e6d91bb4a78c 100644 --- a/Lib/string.py +++ b/Lib/string.py @@ -79,13 +79,13 @@ class Template(metaclass=_TemplateMetaclass): """A string class for supporting $-substitutions.""" delimiter = '$' - idpattern = r'[_a-z][_a-z0-9]*' + # r'[a-z]' matches to non-ASCII letters when used with IGNORECASE, + # but without ASCII flag. We can't add re.ASCII to flags because of + # backward compatibility. So we use local -i flag and [a-zA-Z] pattern. + # See https://bugs.python.org/issue31672 + idpattern = r'(?-i:[_a-zA-Z][_a-zA-Z0-9]*)' braceidpattern = None - - # We use re.I | re.A while compiling Template.idpattern in the metaclass - # above, but since flags is part of the public API, we restore its original - # documented value after class creation for backward compatibility. - flags = _re.IGNORECASE | _re.ASCII + flags = _re.IGNORECASE def __init__(self, template): self.template = template @@ -161,9 +161,6 @@ def convert(mo): return self.pattern.sub(convert, self.template) -# Restore old, documented flag. See Template.flags for detail. 
-Template.flags = _re.IGNORECASE - ######################################################################## # the Formatter class diff --git a/Lib/test/test_string.py b/Lib/test/test_string.py index 6e241ac72abf2b..61ce95664bcdb8 100644 --- a/Lib/test/test_string.py +++ b/Lib/test/test_string.py @@ -270,6 +270,10 @@ def test_invalid_placeholders(self): raises(ValueError, s.substitute, dict(who='tim')) s = Template('$who likes $100') raises(ValueError, s.substitute, dict(who='tim')) + # Template.idpattern should match to only ASCII characters. + # https://bugs.python.org/issue31672 + s = Template("$who likes $ı") # (0x131, DOTLESS I) + raises(ValueError, s.substitute, dict(who='tim')) def test_idpattern_override(self): class PathPattern(Template): From 8b2f42921c02f2a4c4118dfb37ce8177185511b8 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Thu, 12 Oct 2017 02:47:28 +0900 Subject: [PATCH 4/7] Add NEWS and update document --- Doc/library/string.rst | 14 ++++++++++++-- .../2017-10-12-02-47-16.bpo-31672.DaOkVd.rst | 2 ++ 2 files changed, 14 insertions(+), 2 deletions(-) create mode 100644 Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst diff --git a/Doc/library/string.rst b/Doc/library/string.rst index 1a9b630975218a..d466ee22958a3c 100644 --- a/Doc/library/string.rst +++ b/Doc/library/string.rst @@ -755,8 +755,18 @@ attributes: * *idpattern* -- This is the regular expression describing the pattern for non-braced placeholders. The default value is the regular expression - ``[_a-z][_a-z0-9]*``. If this is given and *braceidpattern* is ``None`` - this pattern will also apply to braced placeholders. + ``(?-i:[_a-zA-Z][_a-zA-Z0-9]*)``. Since default *flags* is + ``re.IGNORECASE``, ``[a-z]``Without local flag ``-i``, is used to avoid to match with non ASCII characters. + If this is given and *braceidpattern* is + ``None`` this pattern will also apply to braced placeholders. + + .. note:: + + Default *flags* is ``re.IGNORECASE``. 
So the pattern ``[a-z]`` can match + with some non ASCII characters. That's why We use local ``-i`` flag here. + + When overrinding this class, please consider overriding *flags* with ``0`` + or ``re.IGNORECASE | re.ASCII``. .. versionchanged:: 3.7 *braceidpattern* can be used to define separate patterns used inside and diff --git a/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst b/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst new file mode 100644 index 00000000000000..7dd1225c3ebfd1 --- /dev/null +++ b/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst @@ -0,0 +1,2 @@ +``idpattern`` in ``string.Template`` matched some non ASCII characters. Now +it uses ``-i`` regular expression local flag to avoid non ASCII characters. From 32b765b51b19855bffcfdbbe9c8e640d8d331db1 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Fri, 13 Oct 2017 11:53:54 +0900 Subject: [PATCH 5/7] Update based on review. --- Doc/library/string.rst | 11 +++++------ Lib/test/test_string.py | 4 +++- .../Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/Doc/library/string.rst b/Doc/library/string.rst index d466ee22958a3c..d3e9d4882d1686 100644 --- a/Doc/library/string.rst +++ b/Doc/library/string.rst @@ -755,18 +755,17 @@ attributes: * *idpattern* -- This is the regular expression describing the pattern for non-braced placeholders. The default value is the regular expression - ``(?-i:[_a-zA-Z][_a-zA-Z0-9]*)``. Since default *flags* is - ``re.IGNORECASE``, ``[a-z]``Without local flag ``-i``, is used to avoid to match with non ASCII characters. - If this is given and *braceidpattern* is + ``(?-i:[_a-zA-Z][_a-zA-Z0-9]*)``. If this is given and *braceidpattern* is ``None`` this pattern will also apply to braced placeholders. .. note:: Default *flags* is ``re.IGNORECASE``. So the pattern ``[a-z]`` can match - with some non ASCII characters. That's why We use local ``-i`` flag here. 
+ with some non-ASCII characters. That's why We use local ``-i`` flag here. - When overrinding this class, please consider overriding *flags* with ``0`` - or ``re.IGNORECASE | re.ASCII``. + While *flags* is kept to ``re.IGNORECASE`` for backward compatibility, + you can override it to ``0`` or ``re.IGNORECASE | re.ASCII`` when + subclassing. It's simple way to avoid unexpected match like above example. .. versionchanged:: 3.7 *braceidpattern* can be used to define separate patterns used inside and diff --git a/Lib/test/test_string.py b/Lib/test/test_string.py index 61ce95664bcdb8..3480459c282c1d 100644 --- a/Lib/test/test_string.py +++ b/Lib/test/test_string.py @@ -272,7 +272,9 @@ def test_invalid_placeholders(self): raises(ValueError, s.substitute, dict(who='tim')) # Template.idpattern should match to only ASCII characters. # https://bugs.python.org/issue31672 - s = Template("$who likes $ı") # (0x131, DOTLESS I) + s = Template("$who likes $\u0131") # (DOTLESS I) + raises(ValueError, s.substitute, dict(who='tim')) + s = Template("$who likes $\u0130") # (LATIN CAPITAL LETTER I WITH DOT ABOVE) raises(ValueError, s.substitute, dict(who='tim')) def test_idpattern_override(self): diff --git a/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst b/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst index 7dd1225c3ebfd1..b8de1f3b1db6b1 100644 --- a/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst +++ b/Misc/NEWS.d/next/Library/2017-10-12-02-47-16.bpo-31672.DaOkVd.rst @@ -1,2 +1,2 @@ -``idpattern`` in ``string.Template`` matched some non ASCII characters. Now -it uses ``-i`` regular expression local flag to avoid non ASCII characters. +``idpattern`` in ``string.Template`` matched some non-ASCII characters. Now +it uses ``-i`` regular expression local flag to avoid non-ASCII characters. 
From 1e4bb627e906b1c845b2086369f66d6b45a7f844 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Fri, 13 Oct 2017 15:37:40 +0900 Subject: [PATCH 6/7] Update string.rst --- Doc/library/string.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Doc/library/string.rst b/Doc/library/string.rst index d3e9d4882d1686..d58216a1ef70b0 100644 --- a/Doc/library/string.rst +++ b/Doc/library/string.rst @@ -761,7 +761,7 @@ attributes: .. note:: Default *flags* is ``re.IGNORECASE``. So the pattern ``[a-z]`` can match - with some non-ASCII characters. That's why We use local ``-i`` flag here. + with some non-ASCII characters. That's why we use local ``-i`` flag here. While *flags* is kept to ``re.IGNORECASE`` for backward compatibility, you can override it to ``0`` or ``re.IGNORECASE | re.ASCII`` when From 961a2069188fe8b929264bd499a4a3ef6a68ca76 Mon Sep 17 00:00:00 2001 From: INADA Naoki Date: Fri, 13 Oct 2017 15:39:01 +0900 Subject: [PATCH 7/7] Update string.rst --- Doc/library/string.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Doc/library/string.rst b/Doc/library/string.rst index d58216a1ef70b0..1076cdb2346dd4 100644 --- a/Doc/library/string.rst +++ b/Doc/library/string.rst @@ -760,8 +760,8 @@ attributes: .. note:: - Default *flags* is ``re.IGNORECASE``. So the pattern ``[a-z]`` can match - with some non-ASCII characters. That's why we use local ``-i`` flag here. + Since default *flags* is ``re.IGNORECASE``, pattern ``[a-z]`` can match + with some non-ASCII characters. That's why we use local ``-i`` flag here. While *flags* is kept to ``re.IGNORECASE`` for backward compatibility, you can override it to ``0`` or ``re.IGNORECASE | re.ASCII`` when