Name: Fetch YouTube video transcripts of travel creators
Rating: 1.8 (9593 reviews)
Author: yogeba


import requests
import csv
import time
import pandas as pd

# YouTube Data API key sent with every request; substitute a real key here.
API_KEY = "YOUR_API_KEY"

# Travel-themed queries; each one is passed to the channel search in turn.
SEARCH_KEYWORDS = [
    "travel vlog",
    "travel guide",
    "food travel",
    "adventure travel",
    "backpacking",
    "cultural travel experiences",
    "travel tips",
    "travel destinations",
    "travel blogger",
    "world travel",
    "wanderlust",
    "exploring the world",
    "travel channel",
    "travel documentary",
    "road trips",
]

# Root of the YouTube Data API v3; endpoint names are appended to this.
BASE_URL = "https://www.googleapis.com/youtube/v3"

def collect_travel_channels(api_key, search_keywords):
    """Collect travel-related YouTube channels and save them to a CSV file.

    Issues one ``search`` request per keyword (first page only, at most 50
    results), de-duplicates the channels by ID across all keywords, and
    writes them to ``travel_channels.csv`` in the current directory.

    A keyword whose request fails (network error, non-JSON body, or an API
    error payload without ``items``) is reported and skipped; the remaining
    keywords are still processed.

    Args:
        api_key: YouTube Data API key sent with every request.
        search_keywords: Iterable of query strings to search for.
    """
    columns = ['channelId', 'channelTitle', 'description']
    channels = []
    channel_ids_set = set()

    for keyword in search_keywords:
        print(f"Searching for keyword: {keyword}")
        params = {
            'part': 'snippet',
            'q': keyword,
            'type': 'channel',
            'maxResults': 50,
            'key': api_key,
        }
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(f"{BASE_URL}/search", params=params, timeout=30)
            result = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Route transport/decoding failures through the same error
            # report as an API error payload instead of aborting the run.
            result = {'error': str(exc)}

        if 'items' in result:
            for item in result['items']:
                snippet = item['snippet']
                channel_id = snippet['channelId']
                if channel_id in channel_ids_set:
                    continue
                channel_ids_set.add(channel_id)
                channels.append({
                    'channelId': channel_id,
                    'channelTitle': snippet['channelTitle'],
                    'description': snippet['description'],
                })
        else:
            print(f"Error in response: {result}")

        # Pause to respect API rate limits
        time.sleep(1)

    # Save channels to CSV. Passing the columns explicitly guarantees a
    # header row even when nothing was collected, so the file can always be
    # read back with pandas instead of raising EmptyDataError.
    channels_df = pd.DataFrame(channels, columns=columns)
    channels_df.to_csv('travel_channels.csv', index=False, encoding='utf-8')
    print("Finished collecting channel data.")

def collect_channel_videos(api_key):
    """Retrieve the video list of every collected channel and save it to CSV.

    Reads ``travel_channels.csv`` from the current directory, looks up each
    channel's uploads playlist, fetches the videos in that playlist, and
    writes one row per video to ``channel_videos.csv``. Channels without an
    uploads playlist are reported and skipped.

    Args:
        api_key: YouTube Data API key sent with every request.
    """
    columns = ['channelId', 'channelTitle', 'videoId', 'videoTitle', 'publishedAt']

    try:
        # dtype=str + keep_default_na=False keep titles such as "None",
        # "NA" or "null" as literal text instead of converting them to NaN.
        channels_df = pd.read_csv(
            'travel_channels.csv',
            encoding='utf-8',
            dtype=str,
            keep_default_na=False,
        )
    except pd.errors.EmptyDataError:
        # A file without even a header row means no channels were collected.
        print("No channels found in travel_channels.csv.")
        channels_df = pd.DataFrame(columns=['channelId', 'channelTitle'])

    videos = []

    for row in channels_df.to_dict('records'):
        channel_id = row['channelId']
        channel_title = row['channelTitle']
        print(f"Processing channel: {channel_title} (ID: {channel_id})")

        # Get uploads playlist ID
        uploads_playlist_id = get_uploads_playlist_id(api_key, channel_id)
        if uploads_playlist_id:
            # Get videos from playlist
            channel_videos = get_videos_from_playlist(api_key, uploads_playlist_id)
            videos.extend(
                {
                    'channelId': channel_id,
                    'channelTitle': channel_title,
                    'videoId': video['videoId'],
                    'videoTitle': video['videoTitle'],
                    'publishedAt': video['publishedAt'],
                }
                for video in channel_videos
            )
        else:
            print(f"Skipping channel {channel_title} due to missing uploads playlist.")

        # Pause between channels
        time.sleep(1)

    # Save videos to CSV. Explicit columns guarantee a header row even when
    # no videos were found, so the output is always a readable CSV.
    videos_df = pd.DataFrame(videos, columns=columns)
    videos_df.to_csv('channel_videos.csv', index=False, encoding='utf-8')
    print("Finished collecting video data.")

def get_uploads_playlist_id(api_key, channel_id):
    """Retrieve the uploads playlist ID for a given channel.

    Args:
        api_key: YouTube Data API key sent with the request.
        channel_id: ID of the channel to look up.

    Returns:
        The uploads playlist ID string, or ``None`` when the request fails
        or the response contains no items for the channel.
    """
    params = {
        'part': 'contentDetails',
        'id': channel_id,
        'key': api_key,
    }
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(f"{BASE_URL}/channels", params=params, timeout=30)
        result = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Treat transport/decoding failures like a missing playlist so the
        # caller skips this channel instead of the whole run aborting.
        print(f"Could not get uploads playlist for channel ID: {channel_id} ({exc})")
        return None

    items = result.get('items')
    if not items:
        print(f"Could not get uploads playlist for channel ID: {channel_id}")
        return None

    return items[0]['contentDetails']['relatedPlaylists']['uploads']

def get_videos_from_playlist(api_key, playlist_id):
    """Retrieve all videos from a playlist, following pagination.

    Requests pages of up to 50 playlist items until the response carries no
    ``nextPageToken``. If a page cannot be fetched or the response has no
    ``items``, the error is printed and the videos gathered so far are
    returned.

    Args:
        api_key: YouTube Data API key sent with every request.
        playlist_id: ID of the playlist to list.

    Returns:
        A list of dicts with the keys ``videoId``, ``videoTitle`` and
        ``publishedAt``. ``publishedAt`` is ``None`` for items whose
        ``contentDetails`` has no ``videoPublishedAt`` field.
    """
    videos = []
    params = {
        'part': 'snippet,contentDetails',
        'playlistId': playlist_id,
        'maxResults': 50,
        'key': api_key,
    }

    while True:
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(f"{BASE_URL}/playlistItems", params=params, timeout=30)
            result = response.json()
        except (requests.RequestException, ValueError) as exc:
            # Keep the pages already collected rather than losing them.
            print(f"Error retrieving videos: {exc}")
            break

        if 'items' not in result:
            print(f"Error retrieving videos: {result}")
            break

        for item in result['items']:
            details = item['contentDetails']
            videos.append({
                'videoId': details['videoId'],
                'videoTitle': item['snippet']['title'],
                # videoPublishedAt may be missing on some items (presumably
                # videos that are no longer publicly available); a hard
                # lookup would raise KeyError and drop the whole channel.
                'publishedAt': details.get('videoPublishedAt'),
            })

        next_page_token = result.get('nextPageToken')
        if not next_page_token:
            break

        params['pageToken'] = next_page_token
        # Short pause between pages to stay gentle on the API.
        time.sleep(0.5)

    return videos

def main():
    """Run the whole pipeline: find channels, then list their videos."""
    # The first stage writes travel_channels.csv, which the second reads.
    collect_travel_channels(api_key=API_KEY, search_keywords=SEARCH_KEYWORDS)
    collect_channel_videos(api_key=API_KEY)
    print("Data collection complete.")


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()