| @@ -296,9 +296,11 @@ class InfoExtractor(object): | |||||||
|         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) |         content = self._webpage_read_content(urlh, url_or_request, video_id, note, errnote, fatal) | ||||||
|         return (content, urlh) |         return (content, urlh) | ||||||
|  |  | ||||||
|     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True): |     def _webpage_read_content(self, urlh, url_or_request, video_id, note=None, errnote=None, fatal=True, prefix=None): | ||||||
|         content_type = urlh.headers.get('Content-Type', '') |         content_type = urlh.headers.get('Content-Type', '') | ||||||
|         webpage_bytes = urlh.read() |         webpage_bytes = urlh.read() | ||||||
|  |         if prefix is not None: | ||||||
|  |             webpage_bytes = prefix + webpage_bytes | ||||||
|         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) |         m = re.match(r'[a-zA-Z0-9_.-]+/[a-zA-Z0-9_.-]+\s*;\s*charset=(.+)', content_type) | ||||||
|         if m: |         if m: | ||||||
|             encoding = m.group(1) |             encoding = m.group(1) | ||||||
|   | |||||||
| @@ -452,7 +452,23 @@ class GenericIE(InfoExtractor): | |||||||
|                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', |                 'title': 'Unity 8 desktop-mode windows on Mir! - Ubuntu Discourse', | ||||||
|             }, |             }, | ||||||
|             'playlist_mincount': 2, |             'playlist_mincount': 2, | ||||||
|  |         }, | ||||||
|  |         # Direct link with incorrect MIME type | ||||||
|  |         { | ||||||
|  |             'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', | ||||||
|  |             'md5': '4ccbebe5f36706d85221f204d7eb5913', | ||||||
|  |             'info_dict': { | ||||||
|  |                 'url': 'http://ftp.nluug.nl/video/nluug/2014-11-20_nj14/zaal-2/5_Lennart_Poettering_-_Systemd.webm', | ||||||
|  |                 'id': '5_Lennart_Poettering_-_Systemd', | ||||||
|  |                 'ext': 'webm', | ||||||
|  |                 'title': '5_Lennart_Poettering_-_Systemd', | ||||||
|  |                 'upload_date': '20141120', | ||||||
|  |             }, | ||||||
|  |             'expected_warnings': [ | ||||||
|  |                 'URL could be a direct video link, returning it as such.' | ||||||
|  |             ] | ||||||
|         } |         } | ||||||
|  |  | ||||||
|     ] |     ] | ||||||
|  |  | ||||||
|     def report_following_redirect(self, new_url): |     def report_following_redirect(self, new_url): | ||||||
| @@ -606,10 +622,28 @@ class GenericIE(InfoExtractor): | |||||||
|         if not self._downloader.params.get('test', False) and not is_intentional: |         if not self._downloader.params.get('test', False) and not is_intentional: | ||||||
|             self._downloader.report_warning('Falling back on generic information extractor.') |             self._downloader.report_warning('Falling back on generic information extractor.') | ||||||
|  |  | ||||||
|         if full_response: |         if not full_response: | ||||||
|             webpage = self._webpage_read_content(full_response, url, video_id) |             full_response = self._request_webpage(url, video_id) | ||||||
|         else: |  | ||||||
|             webpage = self._download_webpage(url, video_id) |         # Maybe it's a direct link to a video? | ||||||
|  |         # Be careful not to download the whole thing! | ||||||
|  |         first_bytes = full_response.read(512) | ||||||
|  |         if not re.match(r'^\s*<', first_bytes.decode('utf-8', 'replace')): | ||||||
|  |             self._downloader.report_warning( | ||||||
|  |                 'URL could be a direct video link, returning it as such.') | ||||||
|  |             upload_date = unified_strdate( | ||||||
|  |                 head_response.headers.get('Last-Modified')) | ||||||
|  |             return { | ||||||
|  |                 'id': video_id, | ||||||
|  |                 'title': os.path.splitext(url_basename(url))[0], | ||||||
|  |                 'direct': True, | ||||||
|  |                 'url': url, | ||||||
|  |                 'upload_date': upload_date, | ||||||
|  |             } | ||||||
|  |  | ||||||
|  |         webpage = self._webpage_read_content( | ||||||
|  |             full_response, url, video_id, prefix=first_bytes) | ||||||
|  |  | ||||||
|         self.report_extraction(video_id) |         self.report_extraction(video_id) | ||||||
|  |  | ||||||
|         # Is it an RSS feed? |         # Is it an RSS feed? | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Philipp Hagemeister
					Philipp Hagemeister