[youtube] Improve YouTube Music auto-generated description parsing (closes #20742)
This commit is contained in:
		| @@ -1088,7 +1088,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             }, |             }, | ||||||
|         }, |         }, | ||||||
|         { |         { | ||||||
|             # artist and track fields should return non-null, per issue #20599 |             # Youtube Music Auto-generated description | ||||||
|             'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', |             'url': 'https://music.youtube.com/watch?v=MgNrAu2pzNs', | ||||||
|             'info_dict': { |             'info_dict': { | ||||||
|                 'id': 'MgNrAu2pzNs', |                 'id': 'MgNrAu2pzNs', | ||||||
| @@ -1109,11 +1109,9 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             }, |             }, | ||||||
|         }, |         }, | ||||||
|         { |         { | ||||||
|  |             # Youtube Music Auto-generated description | ||||||
|             # Retrieve 'artist' field from 'Artist:' in video description |             # Retrieve 'artist' field from 'Artist:' in video description | ||||||
|             # when it is present on youtube music video |             # when it is present on youtube music video | ||||||
|             # Some videos have release_date and no release_year - |  | ||||||
|             # (release_year should be extracted from release_date) |  | ||||||
|             # https://github.com/ytdl-org/youtube-dl/pull/20742#issuecomment-485740932 |  | ||||||
|             'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', |             'url': 'https://www.youtube.com/watch?v=k0jLE7tTwjY', | ||||||
|             'info_dict': { |             'info_dict': { | ||||||
|                 'id': 'k0jLE7tTwjY', |                 'id': 'k0jLE7tTwjY', | ||||||
| @@ -1134,6 +1132,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             }, |             }, | ||||||
|         }, |         }, | ||||||
|         { |         { | ||||||
|  |             # Youtube Music Auto-generated description | ||||||
|             # handle multiple artists on youtube music video |             # handle multiple artists on youtube music video | ||||||
|             'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', |             'url': 'https://www.youtube.com/watch?v=74qn0eJSjpA', | ||||||
|             'info_dict': { |             'info_dict': { | ||||||
| @@ -1155,6 +1154,7 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|             }, |             }, | ||||||
|         }, |         }, | ||||||
|         { |         { | ||||||
|  |             # Youtube Music Auto-generated description | ||||||
|             # handle youtube music video with release_year and no release_date |             # handle youtube music video with release_year and no release_date | ||||||
|             'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', |             'url': 'https://www.youtube.com/watch?v=-hcAI0g-f5M', | ||||||
|             'info_dict': { |             'info_dict': { | ||||||
| @@ -2161,36 +2161,25 @@ class YoutubeIE(YoutubeBaseInfoExtractor): | |||||||
|  |  | ||||||
|         track = extract_meta('Song') |         track = extract_meta('Song') | ||||||
|         artist = extract_meta('Artist') |         artist = extract_meta('Artist') | ||||||
|         album = None |  | ||||||
|         release_date = None |  | ||||||
|         release_year = None |  | ||||||
|  |  | ||||||
|         description_info = video_description.split('\n\n') |         # Youtube Music Auto-generated description | ||||||
|         # If the description of the video has the youtube music auto-generated format, extract additional info |         album = release_date = release_year = None | ||||||
|         if len(description_info) >= 5 and description_info[-1] == 'Auto-generated by YouTube.': |         if video_description: | ||||||
|             track_artist = description_info[1].split(' · ') |             mobj = re.search(r'(?s)Provided to YouTube by [^\n]+\n+(?P<track>[^·]+)·(?P<artist>[^\n]+)\n+(?P<album>[^\n]+)(?:.+?℗\s*(?P<release_year>\d{4})(?!\d))?(?:.+?Released on\s*:\s*(?P<release_date>\d{4}-\d{2}-\d{2}))?(.+?\nArtist\s*:\s*(?P<clean_artist>[^\n]+))?', video_description) | ||||||
|             if len(track_artist) >= 2: |             if mobj: | ||||||
|                 if track is None: |                 if not track: | ||||||
|                     track = track_artist[0] |                     track = mobj.group('track').strip() | ||||||
|                 if artist is None: |                 if not artist: | ||||||
|                     artist = re.search(r'Artist: ([^\n]+)', description_info[-2]) |                     artist = mobj.group('clean_artist') or ', '.join(a.strip() for a in mobj.group('artist').split('·')) | ||||||
|                     if artist: |                 album = mobj.group('album').strip() | ||||||
|                         artist = artist.group(1) |                 release_year = mobj.group('release_year') | ||||||
|                     if artist is None: |                 release_date = mobj.group('release_date') | ||||||
|                         artist = track_artist[1] |                 if release_date: | ||||||
|                         # handle multiple artists |                     release_date = release_date.replace('-', '') | ||||||
|                         if len(track_artist) > 2: |                     if not release_year: | ||||||
|                             for i in range(2, len(track_artist)): |                         release_year = int(release_date[:4]) | ||||||
|                                 artist += ', %s' % track_artist[i] |                 if release_year: | ||||||
|             release_year = re.search(r'℗ ([0-9]+)', video_description) |                     release_year = int(release_year) | ||||||
|             if release_year: |  | ||||||
|                 release_year = int_or_none(release_year.group(1)) |  | ||||||
|             album = description_info[2] |  | ||||||
|             if description_info[4].startswith('Released on: '): |  | ||||||
|                 release_date = description_info[4].split(': ')[1].replace('-', '') |  | ||||||
|                 # extract release_year from release_date if necessary |  | ||||||
|                 if release_year is None: |  | ||||||
|                     release_year = int_or_none(release_date[0:4]) |  | ||||||
|  |  | ||||||
|         m_episode = re.search( |         m_episode = re.search( | ||||||
|             r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', |             r'<div[^>]+id="watch7-headline"[^>]*>\s*<span[^>]*>.*?>(?P<series>[^<]+)</a></b>\s*S(?P<season>\d+)\s*•\s*E(?P<episode>\d+)</span>', | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user
	 Remita Amine
					Remita Amine