# Download files from Google Drive and upload to S3
This repo contains a Python tool that downloads files from a Google Drive folder and uploads them to S3.
## Features
- Downloads each file to an in-memory file handle and uploads it to S3 from there, without using precious disk space (see the sketch after this list).
- Operates on one file at a time.
- The only speed limitation is network bandwidth.
- Downloads can be picked up from where you left off using the paging feature.
- Can take a file of known filenames and only upload the Google Drive files whose names match.
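The heart of the tool is the pattern below: stream a Drive file into an in-memory buffer, then hand that buffer straight to S3. This is a minimal sketch condensed from the full script further down; `transfer_one_file` and its arguments are illustrative names, and authenticated `drive_service` and `s3` objects are assumed to exist already.

```python
import io

# Condensed from the full script below; assumes an authenticated Drive
# service and a boto3 S3 resource have already been constructed.
from googleapiclient.http import MediaIoBaseDownload


def transfer_one_file(drive_service, s3, file_id, bucket, key):
    """Stream one Drive file into memory, then upload it to S3."""
    request = drive_service.files().get_media(fileId=file_id)
    fh = io.BytesIO()  # in-memory buffer: nothing is written to disk
    downloader = MediaIoBaseDownload(fh, request)
    done = False
    while not done:
        _, done = downloader.next_chunk()  # pull the file down chunk by chunk
    s3.Bucket(bucket).put_object(Key=key, Body=fh.getvalue())
    fh.close()  # release the buffer's memory
```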
## First time authentication
- You will need to create a Google Drive app client for use with this script. You do this in your Google API console.
- Download the client secret file, name it `client_secret.json`, and place it in the same folder as the script.
- On first run you'll be asked to authenticate the app and allow it full access to your Drive (it needs this in order to access files shared with you).
## Usage
This tool has built-in help:
```
➜ python download-from-google-drive.py -h
usage: download-from-google-drive.py [-h] [--auth_host_name AUTH_HOST_NAME]
                                     [--noauth_local_webserver]
                                     [--auth_host_port [AUTH_HOST_PORT [AUTH_HOST_PORT ...]]]
                                     [--logging_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}]
                                     --folder_id FOLDER_ID --bucket BUCKET
                                     --key-prefix KEY_PREFIX
                                     [--page-size PAGE_SIZE]
                                     [--start-page START_PAGE]
                                     [--end-page END_PAGE]
                                     [--match-file MATCH_FILE]

optional arguments:
  -h, --help            show this help message and exit
  --auth_host_name AUTH_HOST_NAME
                        Hostname when running a local web server.
  --noauth_local_webserver
                        Do not run a local web server.
  --auth_host_port [AUTH_HOST_PORT [AUTH_HOST_PORT ...]]
                        Port web server should listen on.
  --logging_level {DEBUG,INFO,WARNING,ERROR,CRITICAL}
                        Set the logging level of detail.
  --folder_id FOLDER_ID, -f FOLDER_ID
                        Google Drive Folder ID (it's the end of the folder
                        URI!)
  --bucket BUCKET, -b BUCKET
                        Name of S3 bucket to use
  --key-prefix KEY_PREFIX, -k KEY_PREFIX
                        Key prefix to use (path to a folder)
  --page-size PAGE_SIZE, -p PAGE_SIZE
                        Number of files in each page
  --start-page START_PAGE, -s START_PAGE
                        start from page N of the file listing
  --end-page END_PAGE, -e END_PAGE
                        stop paging at page N of the file listing
  --match-file MATCH_FILE
                        Only process files if the filename is in this file
```
A typical command to download all files and upload to S3 would be:
```
python download-from-google-drive.py -f idofthegooglefolder -b my-bucket -k path/to/files/in/bucket
```
A typical command to download only the files that match a supplied checklist and upload them to S3 (the checklist file contains one filename per line):
```
python download-from-google-drive.py -f idofthegooglefolder -b my-bucket -k path/to/files/in/bucket --match-file checklist_file.txt
```
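The checklist might look like this (the filenames are made up for illustration):

```
holiday-photo-001.jpg
holiday-photo-002.jpg
annual-report.pdf
```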
You may need to process thousands of files but only want to work on them in distinct batches, so that you can pick up where you left off.
The script defaults to 100 files per page, but this can be adjusted. This example processes the files in a Google Drive folder ten at a time, starting at page 20 and ending at page 30.
```
python download-from-google-drive.py -f idofthegooglefolder -b my-bucket -k path/to/files/in/bucket -p 10 -s 20 -e 30
```
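With a page size of 10, pages 20 through 30 cover files 191 to 300: the script still pages through the listings for the first 19 pages but skips processing them, then processes the 11 pages (110 files) in the requested range before stopping.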
The repo also includes a companion script that copies the contents of one Google Drive folder into another, using the same paging approach:

```python
from __future__ import print_function

import datetime
import time
import httplib2
import os
import sys

from apiclient import discovery
import oauth2client
from oauth2client import client
from oauth2client import tools
from logbook import Logger, FileHandler, StreamHandler

log = Logger('copy-google-drive-folder')

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser])
    # add in our specific command line requirements
    flags.add_argument('--source-folder_id', '-f', type=str, required=True,
                       help="Source Google Drive Folder ID (it's the end of the folder URI!) (required)")
    flags.add_argument('--target-folder_id', '-t', type=str, required=True,
                       help="Target Google Drive Folder ID (it's the end of the folder URI!) (required)")
    flags.add_argument('--page-size', '-p', type=int, default=100,
                       help="Number of files in each page (defaults to 100)")
    flags.add_argument('--start-page', '-s', type=int, default=1,
                       help="start from page N of the file listing (defaults to 1)")
    flags.add_argument('--end-page', '-e', type=int, default=None,
                       help="stop paging at page N of the file listing (defaults to not stop before the end)")
    flags.add_argument('--log-dir', '-l', type=str, help='Where to put log files', default='/tmp')
    flags.add_argument('--log-level', type=str, help='Choose a log level', default='INFO')
    args = flags.parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
# SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Copy Google Drive Folders'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'drive-copy-google-folders.json')

    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, args)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        log.info('Storing credentials to ' + credential_path)
    return credentials


def ensure_trailing_slash(val):
    if val[-1] != '/':
        return "{}/".format(val)
    return val


def main():
    """
    Copy a folder from Source to Target
    """
    log_filename = os.path.join(
        args.log_dir,
        'copy-google-drive-folder-{}.log'.format(os.path.basename(time.strftime('%Y%m%d-%H%M%S')))
    )

    # register some logging handlers
    log_handler = FileHandler(
        log_filename,
        mode='w',
        level=args.log_level,
        bubble=True
    )
    stdout_handler = StreamHandler(sys.stdout, level=args.log_level, bubble=True)

    with stdout_handler.applicationbound():
        with log_handler.applicationbound():
            log.info("Arguments: {}".format(args))
            start = time.time()
            log.info("starting at {}".format(time.strftime('%l:%M%p %Z on %b %d, %Y')))

            credentials = get_credentials()
            http = credentials.authorize(httplib2.Http())
            drive_service = discovery.build('drive', 'v3', http=http)

            # get the files in the specified folder.
            files = drive_service.files()
            request = files.list(
                pageSize=args.page_size,
                q="'{}' in parents".format(args.source_folder_id),
                fields="nextPageToken, files(id, name, mimeType)"
            )

            page_counter = 0
            file_counter = 0

            while request is not None:
                file_page = request.execute(http=http)
                page_counter += 1
                page_file_counter = 0  # reset the paging file counter

                # determine the page at which to start processing.
                if page_counter >= args.start_page:
                    log.info(u"######## Page {} ########".format(page_counter))
                    for this_file in file_page['files']:
                        file_counter += 1
                        page_file_counter += 1
                        log.info(u"#== Processing {} {} file number {} on page {}. {} files processed.".format(
                            this_file['mimeType'],
                            this_file['name'],
                            page_file_counter,
                            page_counter,
                            file_counter
                        ))
                        # if not a folder
                        if this_file['mimeType'] != 'application/vnd.google-apps.folder':
                            # Copy the file
                            new_file = {'title': this_file['name']}
                            copied_file = drive_service.files().copy(fileId=this_file['id'], body=new_file).execute()
                            # move it to its new location
                            drive_service.files().update(
                                fileId=copied_file['id'],
                                addParents=args.target_folder_id,
                                removeParents=args.source_folder_id
                            ).execute()
                        else:
                            log.info(u"Skipped Folder")
                else:
                    log.info(u"Skipping Page {}".format(page_counter))

                # stop if we have come to the last user specified page
                if args.end_page and page_counter == args.end_page:
                    log.info(u"Finished paging at page {}".format(page_counter))
                    break

                # request the next page of files
                request = files.list_next(request, file_page)

            log.info("Running time: {}".format(str(datetime.timedelta(seconds=(round(time.time() - start, 3))))))
            log.info("Log written to {}:".format(log_filename))


if __name__ == '__main__':
    main()
```
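A typical invocation, assuming the companion script is saved as `copy-google-drive-folder.py` (the filename here is illustrative):

```
python copy-google-drive-folder.py -f sourcefolderid -t targetfolderid
```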
And the full source of the Drive-to-S3 tool itself, `download-from-google-drive.py`:

```python
from __future__ import print_function

from googleapiclient.http import MediaIoBaseDownload
import httplib2
import os
import sys
import datetime
import time
from apiclient import discovery
import io
import oauth2client
from oauth2client import client
from oauth2client import tools
from logbook import Logger, FileHandler, StreamHandler
from progress_bar import InitBar
import boto3

log = Logger('google-drive-to-s3')

try:
    import argparse
    flags = argparse.ArgumentParser(parents=[tools.argparser])
    # add in our specific command line requirements
    flags.add_argument('--folder_id', '-f', type=str, required=True,
                       help="Google Drive Folder ID (it's the end of the folder URI!) (required)")
    flags.add_argument('--bucket', '-b', type=str, required=True,
                       help="Name of S3 bucket to use (required)")
    flags.add_argument('--key-prefix', '-k', type=str, required=True,
                       help="Key prefix to use as the path to a folder in S3 (required)")
    flags.add_argument('--page-size', '-p', type=int, default=100,
                       help="Number of files in each page (defaults to 100)")
    flags.add_argument('--start-page', '-s', type=int, default=1,
                       help="start from page N of the file listing (defaults to 1)")
    flags.add_argument('--end-page', '-e', type=int, default=None,
                       help="stop paging at page N of the file listing (defaults to not stop before the end)")
    flags.add_argument('--match-file', type=str, default=None,
                       help="Only process files if the filename is in this file (defaults to process all files)")
    flags.add_argument('--log-dir', '-l', type=str, help='Where to put log files', default='/tmp')
    flags.add_argument('--log-level', type=str, help='Choose a log level', default='INFO')
    args = flags.parse_args()
except ImportError:
    flags = None

# If modifying these scopes, delete your previously saved credentials
# at ~/.credentials/drive-python-quickstart.json
# SCOPES = 'https://www.googleapis.com/auth/drive.metadata.readonly'
SCOPES = 'https://www.googleapis.com/auth/drive'
CLIENT_SECRET_FILE = 'client_secret.json'
APPLICATION_NAME = 'Transfer from Google Drive to S3'


def get_credentials():
    """Gets valid user credentials from storage.

    If nothing has been stored, or if the stored credentials are invalid,
    the OAuth2 flow is completed to obtain the new credentials.

    Returns:
        Credentials, the obtained credential.
    """
    home_dir = os.path.expanduser('~')
    credential_dir = os.path.join(home_dir, '.credentials')
    if not os.path.exists(credential_dir):
        os.makedirs(credential_dir)
    credential_path = os.path.join(credential_dir, 'download-from-google-drive-to-s3.json')

    store = oauth2client.file.Storage(credential_path)
    credentials = store.get()
    if not credentials or credentials.invalid:
        flow = client.flow_from_clientsecrets(CLIENT_SECRET_FILE, SCOPES)
        flow.user_agent = APPLICATION_NAME
        if flags:
            credentials = tools.run_flow(flow, store, args)
        else:  # Needed only for compatibility with Python 2.6
            credentials = tools.run(flow, store)
        print('Storing credentials to ' + credential_path)
    return credentials


def ensure_trailing_slash(val):
    if val[-1] != '/':
        return "{}/".format(val)
    return val


def we_should_process_this_file(filename, match_files):
    if not match_files:  # We have not supplied any file names to match against, so process everything.
        return True
    if filename in match_files:
        return True
    return False


def main():
    """
    Download the files in a Google Drive folder and upload them to an S3 bucket.
    """
    log_filename = os.path.join(
        args.log_dir,
        'google-drive-to-s3-{}.log'.format(os.path.basename(time.strftime('%Y%m%d-%H%M%S')))
    )

    # register some logging handlers
    log_handler = FileHandler(
        log_filename,
        mode='w',
        level=args.log_level,
        bubble=True
    )
    stdout_handler = StreamHandler(sys.stdout, level=args.log_level, bubble=True)

    with stdout_handler.applicationbound():
        with log_handler.applicationbound():
            log.info("Arguments: {}".format(args))
            start = time.time()
            log.info("starting at {}".format(time.strftime('%l:%M%p %Z on %b %d, %Y')))

            credentials = get_credentials()
            http = credentials.authorize(httplib2.Http())
            drive_service = discovery.build('drive', 'v3', http=http)

            s3 = boto3.resource('s3')

            # load up a match file if we have one.
            if args.match_file:
                with open(args.match_file, 'r') as f:
                    match_filenames = f.read().splitlines()
            else:
                match_filenames = None

            # get the files in the specified folder.
            files = drive_service.files()
            request = files.list(
                pageSize=args.page_size,
                q="'{}' in parents".format(args.folder_id),
                fields="nextPageToken, files(id, name)"
            )

            # make sure our S3 Key prefix has a trailing slash
            key_prefix = ensure_trailing_slash(args.key_prefix)

            page_counter = 0
            file_counter = 0

            while request is not None:
                file_page = request.execute(http=http)
                page_counter += 1
                page_file_counter = 0  # reset the paging file counter

                # determine the page at which to start processing.
                if page_counter >= args.start_page:
                    log.info(u"######## Page {} ########".format(page_counter))

                    for this_file in file_page['files']:
                        file_counter += 1
                        page_file_counter += 1
                        if we_should_process_this_file(this_file['name'], match_filenames):
                            log.info(u"#== Processing {} file number {} on page {}. {} files processed.".format(
                                this_file['name'],
                                page_file_counter,
                                page_counter,
                                file_counter
                            ))

                            # download the file
                            download_request = drive_service.files().get_media(fileId=this_file['id'])
                            fh = io.BytesIO()  # Using an in memory stream location
                            downloader = MediaIoBaseDownload(fh, download_request)
                            done = False
                            pbar = InitBar(this_file['name'])
                            while done is False:
                                status, done = downloader.next_chunk()
                                pbar(int(status.progress() * 100))
                                # print("\rDownload {}%".format(int(status.progress() * 100)))
                            del pbar

                            # upload to bucket
                            log.info(u"Uploading to S3")
                            s3.Bucket(args.bucket).put_object(
                                Key="{}{}".format(key_prefix, this_file['name']),
                                Body=fh.getvalue(),
                                ACL='public-read'
                            )
                            log.info(u"Uploaded to S3")
                            fh.close()  # close the file handle to release memory
                        else:
                            log.info(u"Do not need to process {}".format(this_file['name']))

                # stop if we have come to the last user specified page
                if args.end_page and page_counter == args.end_page:
                    log.info(u"Finished paging at page {}".format(page_counter))
                    break

                # request the next page of files
                request = files.list_next(request, file_page)

            log.info("Running time: {}".format(str(datetime.timedelta(seconds=(round(time.time() - start, 3))))))
            log.info("Log written to {}:".format(log_filename))


if __name__ == '__main__':
    main()
```