[utils] Decode HTML5 entities
Used in test_Vporn_1. Also related to #9270
This commit is contained in:
		| @@ -249,6 +249,8 @@ class TestUtil(unittest.TestCase): | ||||
|         self.assertEqual(unescapeHTML('&#47;'), '/') | ||||
|         self.assertEqual(unescapeHTML('&eacute;'), 'é') | ||||
|         self.assertEqual(unescapeHTML('&#2013266066;'), '&#2013266066;') | ||||
|         # HTML5 entities | ||||
|         self.assertEqual(unescapeHTML('&period;&apos;'), '.\'') | ||||
|  | ||||
|     def test_date_from_str(self): | ||||
|         self.assertEqual(date_from_str('yesterday'), date_from_str('now-1day')) | ||||
|   | ||||
| @@ -39,6 +39,7 @@ from .compat import ( | ||||
|     compat_chr, | ||||
|     compat_etree_fromstring, | ||||
|     compat_html_entities, | ||||
|     compat_html_entities_html5, | ||||
|     compat_http_client, | ||||
|     compat_kwargs, | ||||
|     compat_parse_qs, | ||||
| @@ -456,12 +457,19 @@ def orderedSet(iterable): | ||||
|     return res | ||||
|  | ||||
|  | ||||
| def _htmlentity_transform(entity): | ||||
| def _htmlentity_transform(entity_with_semicolon): | ||||
|     """Transforms an HTML entity to a character.""" | ||||
|     entity = entity_with_semicolon[:-1] | ||||
|  | ||||
|     # Known non-numeric HTML entity | ||||
|     if entity in compat_html_entities.name2codepoint: | ||||
|         return compat_chr(compat_html_entities.name2codepoint[entity]) | ||||
|  | ||||
|     # TODO: HTML5 allows entities without a semicolon. For example, | ||||
|     # '&Eacuteric' should be decoded as 'Éric'. | ||||
|     if entity_with_semicolon in compat_html_entities_html5: | ||||
|         return compat_html_entities_html5[entity_with_semicolon] | ||||
|  | ||||
|     mobj = re.match(r'#(x[0-9a-fA-F]+|[0-9]+)', entity) | ||||
|     if mobj is not None: | ||||
|         numstr = mobj.group(1) | ||||
| @@ -486,7 +494,7 @@ def unescapeHTML(s): | ||||
|     assert type(s) == compat_str | ||||
|  | ||||
|     return re.sub( | ||||
|         r'&([^;]+);', lambda m: _htmlentity_transform(m.group(1)), s) | ||||
|         r'&([^;]+;)', lambda m: _htmlentity_transform(m.group(1)), s) | ||||
|  | ||||
|  | ||||
| def get_subprocess_encoding(): | ||||
|   | ||||
		Reference in New Issue
	
	Block a user
	 Yen Chi Hsuan
					Yen Chi Hsuan