[utils] Add extract_attributes for extracting html tag attributes
This is much more robust than just using regexps, and handles all the common scenarios, such as empty or missing values, repeated attributes, entity decoding, mixed-case names, and the various value quoting schemes.
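For illustration, a minimal usage sketch (the element string and attribute names below are made up for this description, not taken from the change itself):

    from youtube_dl.utils import extract_attributes

    attrs = extract_attributes('<video SRC="video.mp4" controls data-id=42>')
    # Attribute names are lowercased, entities are decoded, and attributes
    # without a value map to None:
    # attrs == {'src': 'video.mp4', 'controls': None, 'data-id': '42'}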
@@ -28,6 +28,7 @@ from youtube_dl.utils import (
     encodeFilename,
     escape_rfc3986,
     escape_url,
+    extract_attributes,
     ExtractorError,
     find_xpath_attr,
     fix_xml_ampersands,
@@ -75,6 +76,7 @@ from youtube_dl.utils import (
     cli_bool_option,
 )
 from youtube_dl.compat import (
+    compat_chr,
     compat_etree_fromstring,
 )
 
@@ -591,6 +593,44 @@ class TestUtil(unittest.TestCase):
         on = js_to_json('{"abc": "def",}')
         self.assertEqual(json.loads(on), {'abc': 'def'})
 
+    def test_extract_attributes(self):
+        self.assertEqual(extract_attributes('<e x="y">'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e x='y'>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="a \'b\' c">'), {'x': "a 'b' c"})
+        self.assertEqual(extract_attributes('<e x=\'a "b" c\'>'), {'x': 'a "b" c'})
+        self.assertEqual(extract_attributes('<e x="&#121;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&#x79;">'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x="&amp;">'), {'x': '&'})  # XML
+        self.assertEqual(extract_attributes('<e x="&quot;">'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x="&pound;">'), {'x': '£'})  # HTML 3.2
+        self.assertEqual(extract_attributes('<e x="&lambda;">'), {'x': 'λ'})  # HTML 4.0
+        self.assertEqual(extract_attributes('<e x="&foo">'), {'x': '&foo'})
+        self.assertEqual(extract_attributes('<e x="\'">'), {'x': "'"})
+        self.assertEqual(extract_attributes('<e x=\'"\'>'), {'x': '"'})
+        self.assertEqual(extract_attributes('<e x >'), {'x': None})
+        self.assertEqual(extract_attributes('<e x=y a>'), {'x': 'y', 'a': None})
+        self.assertEqual(extract_attributes('<e x= y>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e x=1 y=2 x=3>'), {'y': '2', 'x': '3'})
+        self.assertEqual(extract_attributes('<e \nx=\ny\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx=\n"y"\n>'), {'x': 'y'})
+        self.assertEqual(extract_attributes("<e \nx=\n'y'\n>"), {'x': 'y'})
+        self.assertEqual(extract_attributes('<e \nx="\ny\n">'), {'x': '\ny\n'})
+        self.assertEqual(extract_attributes('<e CAPS=x>'), {'caps': 'x'})  # Names lowercased
+        self.assertEqual(extract_attributes('<e x=1 X=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e X=1 x=2>'), {'x': '2'})
+        self.assertEqual(extract_attributes('<e _:funny-name1=1>'), {'_:funny-name1': '1'})
+        self.assertEqual(extract_attributes('<e x="Fáilte 世界 \U0001f600">'), {'x': 'Fáilte 世界 \U0001f600'})
+        self.assertEqual(extract_attributes('<e x="décompose\u0301">'), {'x': 'décompose\u0301'})
+        # "Narrow" Python builds don't support unicode code points outside BMP.
+        try:
+            compat_chr(0x10000)
+            supports_outside_bmp = True
+        except ValueError:
+            supports_outside_bmp = False
+        if supports_outside_bmp:
+            self.assertEqual(extract_attributes('<e x="Smile &#x1f600;!">'), {'x': 'Smile \U0001f600!'})
+
     def test_clean_html(self):
         self.assertEqual(clean_html('a:\nb'), 'a: b')
         self.assertEqual(clean_html('a:\n   "b"'), 'a:    "b"')
@@ -77,6 +77,11 @@ try:
 except ImportError:  # Python 2
     from urllib import urlretrieve as compat_urlretrieve
 
+try:
+    from html.parser import HTMLParser as compat_HTMLParser
+except ImportError:  # Python 2
+    from HTMLParser import HTMLParser as compat_HTMLParser
+
 
 try:
     from subprocess import DEVNULL
@@ -540,6 +545,7 @@ else:
     from tokenize import generate_tokens as compat_tokenize_tokenize
 
 __all__ = [
+    'compat_HTMLParser',
     'compat_HTTPError',
     'compat_basestring',
     'compat_chr',
@@ -35,6 +35,7 @@ import xml.etree.ElementTree
 import zlib
 
 from .compat import (
+    compat_HTMLParser,
     compat_basestring,
     compat_chr,
     compat_etree_fromstring,
@@ -272,6 +273,35 @@ def get_element_by_attribute(attribute, value, html):
 
     return unescapeHTML(res)
 
+class HTMLAttributeParser(compat_HTMLParser):
+    """Trivial HTML parser to gather the attributes for a single element"""
+    def __init__(self):
+        self.attrs = {}
+        compat_HTMLParser.__init__(self)
+
+    def handle_starttag(self, tag, attrs):
+        self.attrs = dict(attrs)
+
+def extract_attributes(html_element):
+    """Given a string for an HTML element such as
+    <el
+         a="foo" B="bar" c="&#98;az" d=boz
+         empty= noval entity="&amp;"
+         sq='"' dq="'"
+    >
+    Decode and return a dictionary of attributes.
+    {
+        'a': 'foo', 'b': 'bar', 'c': 'baz', 'd': 'boz',
+        'empty': '', 'noval': None, 'entity': '&',
+        'sq': '"', 'dq': '\''
+    }.
+    NB HTMLParser is stricter in Python 2.6 & 3.2 than in later versions,
+    but the cases in the unit test will work for all of 2.6, 2.7, 3.2-3.5.
+    """
+    parser = HTMLAttributeParser()
+    parser.feed(html_element)
+    parser.close()
+    return parser.attrs
 
 def clean_html(html):
     """Clean an HTML snippet into a readable string"""
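As a rough sketch of how an extractor might use the new helper (the page snippet, pattern, and variable names here are hypothetical, not part of this commit), one natural pattern is to locate the raw tag with a regexp and then let extract_attributes() take care of quoting and entity decoding:

    import re
    from youtube_dl.utils import extract_attributes

    webpage = '<source src="http://example.com/v.mp4" type=video/mp4 data-res="720">'
    mobj = re.search(r'<source[^>]+>', webpage)
    if mobj:
        source_attrs = extract_attributes(mobj.group(0))
        # source_attrs == {'src': 'http://example.com/v.mp4',
        #                  'type': 'video/mp4', 'data-res': '720'}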