Improve the OpenGraph regex
* Do not accept '>' between the property and content attributes.
* Recognize properties whose content attribute comes before the property attribute by trying two regexes (fixes the extraction of the description for SlideshareIE).
This commit is contained in:
parent
85d61685f1
commit
ab2d524780
@ -315,13 +315,17 @@ class InfoExtractor(object):
|
|||||||
|
|
||||||
# Helper functions for extracting OpenGraph info
|
# Helper functions for extracting OpenGraph info
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def _og_regex(prop):
|
def _og_regexes(prop):
|
||||||
return r'<meta.+?property=[\'"]og:%s[\'"].+?content=(?:"(.+?)"|\'(.+?)\')' % re.escape(prop)
|
esc_prop = re.escape(prop)
|
||||||
|
return [
|
||||||
|
r'<meta[^>]+?property=[\'"]og:%s[\'"][^>]+?content=(?:"(.+?)"|\'(.+?)\')' % esc_prop,
|
||||||
|
r'<meta[^>]+?content=(?:"(.+?)"|\'(.+?)\')[^>]+?property=[\'"]og:%s[\'"]' % esc_prop,
|
||||||
|
]
|
||||||
|
|
||||||
def _og_search_property(self, prop, html, name=None, **kargs):
|
def _og_search_property(self, prop, html, name=None, **kargs):
|
||||||
if name is None:
|
if name is None:
|
||||||
name = 'OpenGraph %s' % prop
|
name = 'OpenGraph %s' % prop
|
||||||
escaped = self._search_regex(self._og_regex(prop), html, name, flags=re.DOTALL, **kargs)
|
escaped = self._search_regex(self._og_regexes(prop), html, name, flags=re.DOTALL, **kargs)
|
||||||
if escaped is None:
|
if escaped is None:
|
||||||
return None
|
return None
|
||||||
return unescapeHTML(escaped)
|
return unescapeHTML(escaped)
|
||||||
@ -336,8 +340,8 @@ class InfoExtractor(object):
|
|||||||
return self._og_search_property('title', html, **kargs)
|
return self._og_search_property('title', html, **kargs)
|
||||||
|
|
||||||
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
|
def _og_search_video_url(self, html, name='video url', secure=True, **kargs):
|
||||||
regexes = [self._og_regex('video')]
|
regexes = self._og_regexes('video')
|
||||||
if secure: regexes.insert(0, self._og_regex('video:secure_url'))
|
if secure: regexes = self._og_regexes('video:secure_url') + regexes
|
||||||
return self._html_search_regex(regexes, html, name, **kargs)
|
return self._html_search_regex(regexes, html, name, **kargs)
|
||||||
|
|
||||||
def _rta_search(self, html):
|
def _rta_search(self, html):
|
||||||
|
Loading…
Reference in New Issue
Block a user