[ard] improve extraction (closes #23761)
- simplify extraction
- extract age limit and series
- bypass geo-restriction
This commit is contained in:
		| @@ -1,6 +1,7 @@ | |||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
|  |  | ||||||
|  | import json | ||||||
| import re | import re | ||||||
|  |  | ||||||
| from .common import InfoExtractor | from .common import InfoExtractor | ||||||
| @@ -22,7 +23,101 @@ from ..utils import ( | |||||||
| from ..compat import compat_etree_fromstring | from ..compat import compat_etree_fromstring | ||||||
|  |  | ||||||
|  |  | ||||||
| class ARDMediathekIE(InfoExtractor): | class ARDMediathekBaseIE(InfoExtractor): | ||||||
|  |     _GEO_COUNTRIES = ['DE'] | ||||||
|  |  | ||||||
|  |     def _extract_media_info(self, media_info_url, webpage, video_id): | ||||||
|  |         media_info = self._download_json( | ||||||
|  |             media_info_url, video_id, 'Downloading media JSON') | ||||||
|  |         return self._parse_media_info(media_info, video_id, '"fsk"' in webpage) | ||||||
|  |  | ||||||
|  |     def _parse_media_info(self, media_info, video_id, fsk): | ||||||
|  |         formats = self._extract_formats(media_info, video_id) | ||||||
|  |  | ||||||
|  |         if not formats: | ||||||
|  |             if fsk: | ||||||
|  |                 raise ExtractorError( | ||||||
|  |                     'This video is only available after 20:00', expected=True) | ||||||
|  |             elif media_info.get('_geoblocked'): | ||||||
|  |                 self.raise_geo_restricted( | ||||||
|  |                     'This video is not available due to geoblocking', | ||||||
|  |                     countries=self._GEO_COUNTRIES) | ||||||
|  |  | ||||||
|  |         self._sort_formats(formats) | ||||||
|  |  | ||||||
|  |         subtitles = {} | ||||||
|  |         subtitle_url = media_info.get('_subtitleUrl') | ||||||
|  |         if subtitle_url: | ||||||
|  |             subtitles['de'] = [{ | ||||||
|  |                 'ext': 'ttml', | ||||||
|  |                 'url': subtitle_url, | ||||||
|  |             }] | ||||||
|  |  | ||||||
|  |         return { | ||||||
|  |             'id': video_id, | ||||||
|  |             'duration': int_or_none(media_info.get('_duration')), | ||||||
|  |             'thumbnail': media_info.get('_previewImage'), | ||||||
|  |             'is_live': media_info.get('_isLive') is True, | ||||||
|  |             'formats': formats, | ||||||
|  |             'subtitles': subtitles, | ||||||
|  |         } | ||||||
|  |  | ||||||
|  |     def _extract_formats(self, media_info, video_id): | ||||||
|  |         type_ = media_info.get('_type') | ||||||
|  |         media_array = media_info.get('_mediaArray', []) | ||||||
|  |         formats = [] | ||||||
|  |         for num, media in enumerate(media_array): | ||||||
|  |             for stream in media.get('_mediaStreamArray', []): | ||||||
|  |                 stream_urls = stream.get('_stream') | ||||||
|  |                 if not stream_urls: | ||||||
|  |                     continue | ||||||
|  |                 if not isinstance(stream_urls, list): | ||||||
|  |                     stream_urls = [stream_urls] | ||||||
|  |                 quality = stream.get('_quality') | ||||||
|  |                 server = stream.get('_server') | ||||||
|  |                 for stream_url in stream_urls: | ||||||
|  |                     if not url_or_none(stream_url): | ||||||
|  |                         continue | ||||||
|  |                     ext = determine_ext(stream_url) | ||||||
|  |                     if quality != 'auto' and ext in ('f4m', 'm3u8'): | ||||||
|  |                         continue | ||||||
|  |                     if ext == 'f4m': | ||||||
|  |                         formats.extend(self._extract_f4m_formats( | ||||||
|  |                             update_url_query(stream_url, { | ||||||
|  |                                 'hdcore': '3.1.1', | ||||||
|  |                                 'plugin': 'aasp-3.1.1.69.124' | ||||||
|  |                             }), video_id, f4m_id='hds', fatal=False)) | ||||||
|  |                     elif ext == 'm3u8': | ||||||
|  |                         formats.extend(self._extract_m3u8_formats( | ||||||
|  |                             stream_url, video_id, 'mp4', 'm3u8_native', | ||||||
|  |                             m3u8_id='hls', fatal=False)) | ||||||
|  |                     else: | ||||||
|  |                         if server and server.startswith('rtmp'): | ||||||
|  |                             f = { | ||||||
|  |                                 'url': server, | ||||||
|  |                                 'play_path': stream_url, | ||||||
|  |                                 'format_id': 'a%s-rtmp-%s' % (num, quality), | ||||||
|  |                             } | ||||||
|  |                         else: | ||||||
|  |                             f = { | ||||||
|  |                                 'url': stream_url, | ||||||
|  |                                 'format_id': 'a%s-%s-%s' % (num, ext, quality) | ||||||
|  |                             } | ||||||
|  |                         m = re.search( | ||||||
|  |                             r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', | ||||||
|  |                             stream_url) | ||||||
|  |                         if m: | ||||||
|  |                             f.update({ | ||||||
|  |                                 'width': int(m.group('width')), | ||||||
|  |                                 'height': int(m.group('height')), | ||||||
|  |                             }) | ||||||
|  |                         if type_ == 'audio': | ||||||
|  |                             f['vcodec'] = 'none' | ||||||
|  |                         formats.append(f) | ||||||
|  |         return formats | ||||||
|  |  | ||||||
|  |  | ||||||
|  | class ARDMediathekIE(ARDMediathekBaseIE): | ||||||
|     IE_NAME = 'ARD:mediathek' |     IE_NAME = 'ARD:mediathek' | ||||||
|     _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' |     _VALID_URL = r'^https?://(?:(?:(?:www|classic)\.)?ardmediathek\.de|mediathek\.(?:daserste|rbb-online)\.de|one\.ard\.de)/(?:.*/)(?P<video_id>[0-9]+|[^0-9][^/\?]+)[^/\?]*(?:\?.*)?' | ||||||
|  |  | ||||||
| @@ -63,94 +158,6 @@ class ARDMediathekIE(InfoExtractor): | |||||||
|     def suitable(cls, url): |     def suitable(cls, url): | ||||||
|         return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) |         return False if ARDBetaMediathekIE.suitable(url) else super(ARDMediathekIE, cls).suitable(url) | ||||||
|  |  | ||||||
|     def _extract_media_info(self, media_info_url, webpage, video_id): |  | ||||||
|         media_info = self._download_json( |  | ||||||
|             media_info_url, video_id, 'Downloading media JSON') |  | ||||||
|  |  | ||||||
|         formats = self._extract_formats(media_info, video_id) |  | ||||||
|  |  | ||||||
|         if not formats: |  | ||||||
|             if '"fsk"' in webpage: |  | ||||||
|                 raise ExtractorError( |  | ||||||
|                     'This video is only available after 20:00', expected=True) |  | ||||||
|             elif media_info.get('_geoblocked'): |  | ||||||
|                 raise ExtractorError('This video is not available due to geo restriction', expected=True) |  | ||||||
|  |  | ||||||
|         self._sort_formats(formats) |  | ||||||
|  |  | ||||||
|         duration = int_or_none(media_info.get('_duration')) |  | ||||||
|         thumbnail = media_info.get('_previewImage') |  | ||||||
|         is_live = media_info.get('_isLive') is True |  | ||||||
|  |  | ||||||
|         subtitles = {} |  | ||||||
|         subtitle_url = media_info.get('_subtitleUrl') |  | ||||||
|         if subtitle_url: |  | ||||||
|             subtitles['de'] = [{ |  | ||||||
|                 'ext': 'ttml', |  | ||||||
|                 'url': subtitle_url, |  | ||||||
|             }] |  | ||||||
|  |  | ||||||
|         return { |  | ||||||
|             'id': video_id, |  | ||||||
|             'duration': duration, |  | ||||||
|             'thumbnail': thumbnail, |  | ||||||
|             'is_live': is_live, |  | ||||||
|             'formats': formats, |  | ||||||
|             'subtitles': subtitles, |  | ||||||
|         } |  | ||||||
|  |  | ||||||
|     def _extract_formats(self, media_info, video_id): |  | ||||||
|         type_ = media_info.get('_type') |  | ||||||
|         media_array = media_info.get('_mediaArray', []) |  | ||||||
|         formats = [] |  | ||||||
|         for num, media in enumerate(media_array): |  | ||||||
|             for stream in media.get('_mediaStreamArray', []): |  | ||||||
|                 stream_urls = stream.get('_stream') |  | ||||||
|                 if not stream_urls: |  | ||||||
|                     continue |  | ||||||
|                 if not isinstance(stream_urls, list): |  | ||||||
|                     stream_urls = [stream_urls] |  | ||||||
|                 quality = stream.get('_quality') |  | ||||||
|                 server = stream.get('_server') |  | ||||||
|                 for stream_url in stream_urls: |  | ||||||
|                     if not url_or_none(stream_url): |  | ||||||
|                         continue |  | ||||||
|                     ext = determine_ext(stream_url) |  | ||||||
|                     if quality != 'auto' and ext in ('f4m', 'm3u8'): |  | ||||||
|                         continue |  | ||||||
|                     if ext == 'f4m': |  | ||||||
|                         formats.extend(self._extract_f4m_formats( |  | ||||||
|                             update_url_query(stream_url, { |  | ||||||
|                                 'hdcore': '3.1.1', |  | ||||||
|                                 'plugin': 'aasp-3.1.1.69.124' |  | ||||||
|                             }), |  | ||||||
|                             video_id, f4m_id='hds', fatal=False)) |  | ||||||
|                     elif ext == 'm3u8': |  | ||||||
|                         formats.extend(self._extract_m3u8_formats( |  | ||||||
|                             stream_url, video_id, 'mp4', m3u8_id='hls', fatal=False)) |  | ||||||
|                     else: |  | ||||||
|                         if server and server.startswith('rtmp'): |  | ||||||
|                             f = { |  | ||||||
|                                 'url': server, |  | ||||||
|                                 'play_path': stream_url, |  | ||||||
|                                 'format_id': 'a%s-rtmp-%s' % (num, quality), |  | ||||||
|                             } |  | ||||||
|                         else: |  | ||||||
|                             f = { |  | ||||||
|                                 'url': stream_url, |  | ||||||
|                                 'format_id': 'a%s-%s-%s' % (num, ext, quality) |  | ||||||
|                             } |  | ||||||
|                         m = re.search(r'_(?P<width>\d+)x(?P<height>\d+)\.mp4$', stream_url) |  | ||||||
|                         if m: |  | ||||||
|                             f.update({ |  | ||||||
|                                 'width': int(m.group('width')), |  | ||||||
|                                 'height': int(m.group('height')), |  | ||||||
|                             }) |  | ||||||
|                         if type_ == 'audio': |  | ||||||
|                             f['vcodec'] = 'none' |  | ||||||
|                         formats.append(f) |  | ||||||
|         return formats |  | ||||||
|  |  | ||||||
|     def _real_extract(self, url): |     def _real_extract(self, url): | ||||||
|         # determine video id from url |         # determine video id from url | ||||||
|         m = re.match(self._VALID_URL, url) |         m = re.match(self._VALID_URL, url) | ||||||
| @@ -302,19 +309,20 @@ class ARDIE(InfoExtractor): | |||||||
|         } |         } | ||||||
|  |  | ||||||
|  |  | ||||||
| class ARDBetaMediathekIE(InfoExtractor): | class ARDBetaMediathekIE(ARDMediathekBaseIE): | ||||||
|     _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/[^/]+/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?' |     _VALID_URL = r'https://(?:beta|www)\.ardmediathek\.de/(?P<client>[^/]+)/(?:player|live)/(?P<video_id>[a-zA-Z0-9]+)(?:/(?P<display_id>[^/?#]+))?' | ||||||
|     _TESTS = [{ |     _TESTS = [{ | ||||||
|         'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', |         'url': 'https://beta.ardmediathek.de/ard/player/Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE/die-robuste-roswita', | ||||||
|         'md5': '2d02d996156ea3c397cfc5036b5d7f8f', |         'md5': 'dfdc87d2e7e09d073d5a80770a9ce88f', | ||||||
|         'info_dict': { |         'info_dict': { | ||||||
|             'display_id': 'die-robuste-roswita', |             'display_id': 'die-robuste-roswita', | ||||||
|             'id': 'Y3JpZDovL2Rhc2Vyc3RlLmRlL3RhdG9ydC9mYmM4NGM1NC0xNzU4LTRmZGYtYWFhZS0wYzcyZTIxNGEyMDE', |             'id': '70153354', | ||||||
|             'title': 'Tatort: Die robuste Roswita', |             'title': 'Die robuste Roswita', | ||||||
|             'description': r're:^Der Mord.*trüber ist als die Ilm.', |             'description': r're:^Der Mord.*trüber ist als die Ilm.', | ||||||
|             'duration': 5316, |             'duration': 5316, | ||||||
|             'thumbnail': 'https://img.ardmediathek.de/standard/00/55/43/59/34/-1774185891/16x9/960?mandant=ard', |             'thumbnail': 'https://img.ardmediathek.de/standard/00/70/15/33/90/-1852531467/16x9/960?mandant=ard', | ||||||
|             'upload_date': '20180826', |             'timestamp': 1577047500, | ||||||
|  |             'upload_date': '20191222', | ||||||
|             'ext': 'mp4', |             'ext': 'mp4', | ||||||
|         }, |         }, | ||||||
|     }, { |     }, { | ||||||
| @@ -330,71 +338,68 @@ class ARDBetaMediathekIE(InfoExtractor): | |||||||
|         video_id = mobj.group('video_id') |         video_id = mobj.group('video_id') | ||||||
|         display_id = mobj.group('display_id') or video_id |         display_id = mobj.group('display_id') or video_id | ||||||
|  |  | ||||||
|         webpage = self._download_webpage(url, display_id) |         player_page = self._download_json( | ||||||
|         data_json = self._search_regex(r'window\.__APOLLO_STATE__\s*=\s*(\{.*);\n', webpage, 'json') |             'https://api.ardmediathek.de/public-gateway', | ||||||
|         data = self._parse_json(data_json, display_id) |             display_id, data=json.dumps({ | ||||||
|  |                 'query': '''{ | ||||||
|         res = { |   playerPage(client:"%s", clipId: "%s") { | ||||||
|             'id': video_id, |     blockedByFsk | ||||||
|             'display_id': display_id, |     broadcastedOn | ||||||
|  |     maturityContentRating | ||||||
|  |     mediaCollection { | ||||||
|  |       _duration | ||||||
|  |       _geoblocked | ||||||
|  |       _isLive | ||||||
|  |       _mediaArray { | ||||||
|  |         _mediaStreamArray { | ||||||
|  |           _quality | ||||||
|  |           _server | ||||||
|  |           _stream | ||||||
|         } |         } | ||||||
|         formats = [] |       } | ||||||
|         subtitles = {} |       _previewImage | ||||||
|         geoblocked = False |       _subtitleUrl | ||||||
|         for widget in data.values(): |       _type | ||||||
|             if widget.get('_geoblocked') is True: |     } | ||||||
|                 geoblocked = True |     show { | ||||||
|             if '_duration' in widget: |       title | ||||||
|                 res['duration'] = int_or_none(widget['_duration']) |     } | ||||||
|             if 'clipTitle' in widget: |     synopsis | ||||||
|                 res['title'] = widget['clipTitle'] |     title | ||||||
|             if '_previewImage' in widget: |     tracking { | ||||||
|                 res['thumbnail'] = widget['_previewImage'] |       atiCustomVars { | ||||||
|             if 'broadcastedOn' in widget: |         contentId | ||||||
|                 res['timestamp'] = unified_timestamp(widget['broadcastedOn']) |       } | ||||||
|             if 'synopsis' in widget: |     } | ||||||
|                 res['description'] = widget['synopsis'] |   } | ||||||
|             subtitle_url = url_or_none(widget.get('_subtitleUrl')) | }''' % (mobj.group('client'), video_id), | ||||||
|             if subtitle_url: |             }).encode(), headers={ | ||||||
|                 subtitles.setdefault('de', []).append({ |                 'Content-Type': 'application/json' | ||||||
|                     'ext': 'ttml', |             })['data']['playerPage'] | ||||||
|                     'url': subtitle_url, |         title = player_page['title'] | ||||||
|  |         content_id = str_or_none(try_get( | ||||||
|  |             player_page, lambda x: x['tracking']['atiCustomVars']['contentId'])) | ||||||
|  |         media_collection = player_page.get('mediaCollection') or {} | ||||||
|  |         if not media_collection and content_id: | ||||||
|  |             media_collection = self._download_json( | ||||||
|  |                 'https://www.ardmediathek.de/play/media/' + content_id, | ||||||
|  |                 content_id, fatal=False) or {} | ||||||
|  |         info = self._parse_media_info( | ||||||
|  |             media_collection, content_id or video_id, | ||||||
|  |             player_page.get('blockedByFsk')) | ||||||
|  |         age_limit = None | ||||||
|  |         description = player_page.get('synopsis') | ||||||
|  |         maturity_content_rating = player_page.get('maturityContentRating') | ||||||
|  |         if maturity_content_rating: | ||||||
|  |             age_limit = int_or_none(maturity_content_rating.lstrip('FSK')) | ||||||
|  |         if not age_limit: | ||||||
|  |             age_limit = int_or_none(self._search_regex(r'\(FSK\s*(\d+)\)\s*$', description, 'age limit', default=None)) | ||||||
|  |         info.update({ | ||||||
|  |             'age_limit': age_limit, | ||||||
|  |             'display_id': display_id, | ||||||
|  |             'title': title, | ||||||
|  |             'description': description, | ||||||
|  |             'timestamp': unified_timestamp(player_page.get('broadcastedOn')), | ||||||
|  |             'series': try_get(player_page, lambda x: x['show']['title']), | ||||||
|         }) |         }) | ||||||
|             if '_quality' in widget: |         return info | ||||||
|                 format_url = url_or_none(try_get( |  | ||||||
|                     widget, lambda x: x['_stream']['json'][0])) |  | ||||||
|                 if not format_url: |  | ||||||
|                     continue |  | ||||||
|                 ext = determine_ext(format_url) |  | ||||||
|                 if ext == 'f4m': |  | ||||||
|                     formats.extend(self._extract_f4m_formats( |  | ||||||
|                         format_url + '?hdcore=3.11.0', |  | ||||||
|                         video_id, f4m_id='hds', fatal=False)) |  | ||||||
|                 elif ext == 'm3u8': |  | ||||||
|                     formats.extend(self._extract_m3u8_formats( |  | ||||||
|                         format_url, video_id, 'mp4', m3u8_id='hls', |  | ||||||
|                         fatal=False)) |  | ||||||
|                 else: |  | ||||||
|                     # HTTP formats are not available when geoblocked is True, |  | ||||||
|                     # other formats are fine though |  | ||||||
|                     if geoblocked: |  | ||||||
|                         continue |  | ||||||
|                     quality = str_or_none(widget.get('_quality')) |  | ||||||
|                     formats.append({ |  | ||||||
|                         'format_id': ('http-' + quality) if quality else 'http', |  | ||||||
|                         'url': format_url, |  | ||||||
|                         'preference': 10,  # Plain HTTP, that's nice |  | ||||||
|                     }) |  | ||||||
|  |  | ||||||
|         if not formats and geoblocked: |  | ||||||
|             self.raise_geo_restricted( |  | ||||||
|                 msg='This video is not available due to geoblocking', |  | ||||||
|                 countries=['DE']) |  | ||||||
|  |  | ||||||
|         self._sort_formats(formats) |  | ||||||
|         res.update({ |  | ||||||
|             'subtitles': subtitles, |  | ||||||
|             'formats': formats, |  | ||||||
|         }) |  | ||||||
|  |  | ||||||
|         return res |  | ||||||
|   | |||||||
| @@ -1,14 +1,14 @@ | |||||||
| # coding: utf-8 | # coding: utf-8 | ||||||
| from __future__ import unicode_literals | from __future__ import unicode_literals | ||||||
|  |  | ||||||
| from .ard import ARDMediathekIE | from .ard import ARDMediathekBaseIE | ||||||
| from ..utils import ( | from ..utils import ( | ||||||
|     ExtractorError, |     ExtractorError, | ||||||
|     get_element_by_attribute, |     get_element_by_attribute, | ||||||
| ) | ) | ||||||
|  |  | ||||||
|  |  | ||||||
| class SRMediathekIE(ARDMediathekIE): | class SRMediathekIE(ARDMediathekBaseIE): | ||||||
|     IE_NAME = 'sr:mediathek' |     IE_NAME = 'sr:mediathek' | ||||||
|     IE_DESC = 'Saarländischer Rundfunk' |     IE_DESC = 'Saarländischer Rundfunk' | ||||||
|     _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' |     _VALID_URL = r'https?://sr-mediathek(?:\.sr-online)?\.de/index\.php\?.*?&id=(?P<id>[0-9]+)' | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Remita Amine
					Remita Amine