[camdemy] Simplify and make more robust (#4938)
Do not raise errors if view count or upload date extraction fails. Remove re.MULTILINE, which had no effect because the patterns contain no ^ or $ anchors. Follow PEP 8 naming conventions.
This commit is contained in:
parent
024c53694d
commit
08b38d5401
@ -1,11 +1,18 @@
|
|||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
from __future__ import unicode_literals
|
from __future__ import unicode_literals
|
||||||
|
|
||||||
|
import datetime
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from .common import InfoExtractor
|
from .common import InfoExtractor
|
||||||
from ..compat import (compat_urllib_parse, compat_urlparse)
|
from ..compat import (
|
||||||
from ..utils import parse_iso8601
|
compat_urllib_parse,
|
||||||
|
compat_urlparse,
|
||||||
|
)
|
||||||
|
from ..utils import (
|
||||||
|
parse_iso8601,
|
||||||
|
str_to_int,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class CamdemyIE(InfoExtractor):
|
class CamdemyIE(InfoExtractor):
|
||||||
@ -23,6 +30,7 @@ class CamdemyIE(InfoExtractor):
|
|||||||
'creator': 'ss11spring',
|
'creator': 'ss11spring',
|
||||||
'upload_date': '20130114',
|
'upload_date': '20130114',
|
||||||
'timestamp': 1358154556,
|
'timestamp': 1358154556,
|
||||||
|
'view_count': int,
|
||||||
}
|
}
|
||||||
}, {
|
}, {
|
||||||
# With non-empty description
|
# With non-empty description
|
||||||
@ -55,46 +63,43 @@ class CamdemyIE(InfoExtractor):
|
|||||||
|
|
||||||
def _real_extract(self, url):
|
def _real_extract(self, url):
|
||||||
video_id = self._match_id(url)
|
video_id = self._match_id(url)
|
||||||
|
|
||||||
page = self._download_webpage(url, video_id)
|
page = self._download_webpage(url, video_id)
|
||||||
|
|
||||||
srcFrom = self._html_search_regex(
|
src_from = self._html_search_regex(
|
||||||
r"<div class='srcFrom'>Source: <a title='([^']+)'", page,
|
r"<div class='srcFrom'>Source: <a title='([^']+)'", page,
|
||||||
'external source', default=None)
|
'external source', default=None)
|
||||||
|
if src_from:
|
||||||
if srcFrom:
|
return self.url_result(src_from)
|
||||||
return self.url_result(srcFrom)
|
|
||||||
|
|
||||||
oembed_obj = self._download_json(
|
oembed_obj = self._download_json(
|
||||||
'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
|
'http://www.camdemy.com/oembed/?format=json&url=' + url, video_id)
|
||||||
|
|
||||||
thumb_url = oembed_obj['thumbnail_url']
|
thumb_url = oembed_obj['thumbnail_url']
|
||||||
video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
|
video_folder = compat_urlparse.urljoin(thumb_url, 'video/')
|
||||||
fileListXML = self._download_xml(
|
file_list_doc = self._download_xml(
|
||||||
compat_urlparse.urljoin(video_folder, 'fileList.xml'),
|
compat_urlparse.urljoin(video_folder, 'fileList.xml'),
|
||||||
video_id, 'Filelist XML')
|
video_id, 'Filelist XML')
|
||||||
fileName = fileListXML.find('./video/item/fileName').text
|
file_name = file_list_doc.find('./video/item/fileName').text
|
||||||
|
video_url = compat_urlparse.urljoin(video_folder, file_name)
|
||||||
|
|
||||||
creation_time = self._html_search_regex(
|
timestamp = parse_iso8601(self._html_search_regex(
|
||||||
r"<div class='title'>Posted :</div>[\r\n ]*<div class='value'>([^<>]+)<",
|
r"<div class='title'>Posted\s*:</div>\s*<div class='value'>([^<>]+)<",
|
||||||
page, 'creation time', flags=re.MULTILINE) + '+08:00'
|
page, 'creation time', fatal=False),
|
||||||
creation_timestamp = parse_iso8601(creation_time, delimiter=' ')
|
delimiter=' ', timezone=datetime.timedelta(hours=8))
|
||||||
|
view_count = str_to_int(self._html_search_regex(
|
||||||
view_count_str = self._html_search_regex(
|
r"<div class='title'>Views\s*:</div>\s*<div class='value'>([^<>]+)<",
|
||||||
r"<div class='title'>Views :</div>[\r\n ]*<div class='value'>([^<>]+)<",
|
page, 'view count', fatal=False))
|
||||||
page, 'view count', flags=re.MULTILINE)
|
|
||||||
views = int(view_count_str.replace(',', ''))
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
'id': video_id,
|
'id': video_id,
|
||||||
'url': compat_urlparse.urljoin(video_folder, fileName),
|
'url': video_url,
|
||||||
'title': oembed_obj['title'],
|
'title': oembed_obj['title'],
|
||||||
'thumbnail': thumb_url,
|
'thumbnail': thumb_url,
|
||||||
'description': self._html_search_meta('description', page),
|
'description': self._html_search_meta('description', page),
|
||||||
'creator': oembed_obj['author_name'],
|
'creator': oembed_obj['author_name'],
|
||||||
'duration': oembed_obj['duration'],
|
'duration': oembed_obj['duration'],
|
||||||
'timestamp': creation_timestamp,
|
'timestamp': timestamp,
|
||||||
'view_count': views,
|
'view_count': view_count,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@ -666,26 +666,27 @@ class YoutubeDLHTTPSHandler(compat_urllib_request.HTTPSHandler):
|
|||||||
req, **kwargs)
|
req, **kwargs)
|
||||||
|
|
||||||
|
|
||||||
def parse_iso8601(date_str, delimiter='T'):
|
def parse_iso8601(date_str, delimiter='T', timezone=None):
|
||||||
""" Return a UNIX timestamp from the given date """
|
""" Return a UNIX timestamp from the given date """
|
||||||
|
|
||||||
if date_str is None:
|
if date_str is None:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
m = re.search(
|
if timezone is None:
|
||||||
r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
|
m = re.search(
|
||||||
date_str)
|
r'(\.[0-9]+)?(?:Z$| ?(?P<sign>\+|-)(?P<hours>[0-9]{2}):?(?P<minutes>[0-9]{2})$)',
|
||||||
if not m:
|
date_str)
|
||||||
timezone = datetime.timedelta()
|
if not m:
|
||||||
else:
|
|
||||||
date_str = date_str[:-len(m.group(0))]
|
|
||||||
if not m.group('sign'):
|
|
||||||
timezone = datetime.timedelta()
|
timezone = datetime.timedelta()
|
||||||
else:
|
else:
|
||||||
sign = 1 if m.group('sign') == '+' else -1
|
date_str = date_str[:-len(m.group(0))]
|
||||||
timezone = datetime.timedelta(
|
if not m.group('sign'):
|
||||||
hours=sign * int(m.group('hours')),
|
timezone = datetime.timedelta()
|
||||||
minutes=sign * int(m.group('minutes')))
|
else:
|
||||||
|
sign = 1 if m.group('sign') == '+' else -1
|
||||||
|
timezone = datetime.timedelta(
|
||||||
|
hours=sign * int(m.group('hours')),
|
||||||
|
minutes=sign * int(m.group('minutes')))
|
||||||
date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
|
date_format = '%Y-%m-%d{0}%H:%M:%S'.format(delimiter)
|
||||||
dt = datetime.datetime.strptime(date_str, date_format) - timezone
|
dt = datetime.datetime.strptime(date_str, date_format) - timezone
|
||||||
return calendar.timegm(dt.timetuple())
|
return calendar.timegm(dt.timetuple())
|
||||||
|
Loading…
Reference in New Issue
Block a user