[abcnews] Added a new extractor (closes #3992)

Related: #6108, #8664, #9459
2016-05-17 15:38:57 +08:00 · 2016-05-17 15:38:57 +08:00 · 055f0d3d06
commit 055f0d3d06
parent cdd94c2eae
3 changed files with 141 additions and 2 deletions
--- a/youtube_dl/extractor/abcnews.py
+++ b/youtube_dl/extractor/abcnews.py
@ -0,0 +1,135 @@
 # coding: utf-8
 from __future__ import unicode_literals
 import calendar
 import re
 import time
 from .amp import AMPIE
 from .common import InfoExtractor
 from ..compat import compat_urlparse
 class AbcNewsVideoIE(AMPIE):
    IE_NAME = 'abcnews:video'
    _VALID_URL = 'http://abcnews.go.com/[^/]+/video/(?P<display_id>[0-9a-z-]+)-(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://abcnews.go.com/ThisWeek/video/week-exclusive-irans-foreign-minister-zarif-20411932',
        'info_dict': {
            'id': '20411932',
            'ext': 'mp4',
            'display_id': 'week-exclusive-irans-foreign-minister-zarif',
            'title': '\'This Week\' Exclusive: Iran\'s Foreign Minister Zarif',
            'description': 'George Stephanopoulos goes one-on-one with Iranian Foreign Minister Dr. Javad Zarif.',
            'duration': 180,
            'thumbnail': 're:^https?://.*\.jpg$',
        },
        'params': {
            # m3u8 download
            'skip_download': True,
        },
    }, {
        'url': 'http://abcnews.go.com/2020/video/2020-husband-stands-teacher-jail-student-affairs-26119478',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')
        video_id = mobj.group('id')
        info_dict = self._extract_feed_info(
            'http://abcnews.go.com/video/itemfeed?id=%s' % video_id)
        info_dict.update({
            'id': video_id,
            'display_id': display_id,
        })
        return info_dict
 class AbcNewsIE(InfoExtractor):
    IE_NAME = 'abcnews'
    _VALID_URL = 'https?://abcnews\.go\.com/(?:[^/]+/)+(?P<display_id>[0-9a-z-]+)/story\?id=(?P<id>\d+)'
    _TESTS = [{
        'url': 'http://abcnews.go.com/Blotter/News/dramatic-video-rare-death-job-america/story?id=10498713#.UIhwosWHLjY',
        'info_dict': {
            'id': '10498713',
            'ext': 'flv',
            'display_id': 'dramatic-video-rare-death-job-america',
            'title': 'Occupational Hazards',
            'description': 'Nightline investigates the dangers that lurk at various jobs.',
            'thumbnail': 're:^https?://.*\.jpg$',
            'upload_date': '20100428',
            'timestamp': 1272412800,
        },
        'add_ie': ['AbcNewsVideo'],
    }, {
        'url': 'http://abcnews.go.com/Entertainment/justin-timberlake-performs-stop-feeling-eurovision-2016/story?id=39125818',
        'info_dict': {
            'id': '39125818',
            'ext': 'mp4',
            'display_id': 'justin-timberlake-performs-stop-feeling-eurovision-2016',
            'title': 'Justin Timberlake Drops Hints For Secret Single',
            'description': 'Lara Spencer reports the buzziest stories of the day in "GMA" Pop News.',
            'upload_date': '20160515',
            'timestamp': 1463329500,
        },
        'params': {
            # m3u8 download
            'skip_download': True,
            # The embedded YouTube video is blocked due to copyright issues
            'playlist_items': '1',
        },
        'add_ie': ['AbcNewsVideo'],
    }, {
        'url': 'http://abcnews.go.com/Technology/exclusive-apple-ceo-tim-cook-iphone-cracking-software/story?id=37173343',
        'only_matching': True,
    }]
    def _real_extract(self, url):
        mobj = re.match(self._VALID_URL, url)
        display_id = mobj.group('display_id')
        video_id = mobj.group('id')
        webpage = self._download_webpage(url, video_id)
        video_url = self._search_regex(
            r'window\.abcnvideo\.url\s*=\s*"([^"]+)"', webpage, 'video URL')
        full_video_url = compat_urlparse.urljoin(url, video_url)
        youtube_url = self._html_search_regex(
            r'<iframe[^>]+src="(https://www\.youtube\.com/embed/[^"]+)"',
            webpage, 'YouTube URL', default=None)
        timestamp = None
        date_str = self._html_search_regex(
            r'<span[^>]+class="timestamp">([^<]+)</span>',
            webpage, 'timestamp', fatal=False)
        if date_str:
            tz_offset = 0
            if date_str.endswith(' ET'):  # Eastern Time
                tz_offset = -5
                date_str = date_str[:-3]
            date_formats = ['%b. %d, %Y', '%b %d, %Y, %I:%M %p']
            for date_format in date_formats:
                try:
                    timestamp = calendar.timegm(time.strptime(date_str.strip(), date_format))
                except ValueError:
                    continue
            if timestamp is not None:
                timestamp -= tz_offset * 3600
        entry = {
            '_type': 'url_transparent',
            'ie_key': AbcNewsVideoIE.ie_key(),
            'url': full_video_url,
            'id': video_id,
            'display_id': display_id,
            'timestamp': timestamp,
        }
        if youtube_url:
            entries = [entry, self.url_result(youtube_url, 'Youtube')]
            return self.playlist_result(entries)
        return entry
--- a/youtube_dl/extractor/amp.py
+++ b/youtube_dl/extractor/amp.py
@ -52,7 +52,7 @@ class AMPIE(InfoExtractor):
        for media_data in media_content:
            media = media_data['@attributes']
            media_type = media['type']
-            if media_type == 'video/f4m':
+            if media_type in ('video/f4m', 'application/f4m+xml'):
                formats.extend(self._extract_f4m_formats(
                    media['url'] + '?hdcore=3.4.0&plugin=aasp-3.4.0.132.124',
                    video_id, f4m_id='hds', fatal=False))
@ -61,7 +61,7 @@ class AMPIE(InfoExtractor):
                    media['url'], video_id, 'mp4', m3u8_id='hls', fatal=False))
            else:
                formats.append({
-                    'format_id': media_data['media-category']['@attributes']['label'],
+                    'format_id': media_data.get('media-category', {}).get('@attributes', {}).get('label'),
                    'url': media['url'],
                    'tbr': int_or_none(media.get('bitrate')),
                    'filesize': int_or_none(media.get('fileSize')),
--- a/youtube_dl/extractor/extractors.py
+++ b/youtube_dl/extractor/extractors.py
@ -3,6 +3,10 @@ from __future__ import unicode_literals
 from .abc import ABCIE
 from .abc7news import Abc7NewsIE
 from .abcnews import (
    AbcNewsIE,
    AbcNewsVideoIE,
 )
 from .academicearth import AcademicEarthCourseIE
 from .acast import (
    ACastIE,