Node.js & Amazon S3: How to iterate through all files in a bucket?

Is there any Amazon S3 client library for Node.js that allows listing of all files in S3 bucket?

The best-known ones, aws2js and knox, don't seem to have this functionality.


Solution 1:

Using the official aws-sdk:

var allKeys = [];
// Recursively pages through s3.listObjects until IsTruncated is false,
// accumulating a FLAT array of object descriptors in allKeys, then calls cb.
// cb receives the error (if any) as its first argument.
function listAllKeys(marker, cb)
{
  s3.listObjects({Bucket: s3bucket, Marker: marker}, function(err, data){
    // Propagate failures instead of crashing on a null `data`.
    if (err) return cb(err);

    // concat, not push: push(data.Contents) would nest one array per page.
    allKeys = allKeys.concat(data.Contents);

    if(data.IsTruncated)
      // NextMarker is only returned when a Delimiter was specified, so
      // fall back to the last key of this page as the next Marker.
      listAllKeys(data.NextMarker || data.Contents[data.Contents.length - 1].Key, cb);
    else
      cb();
  });
}

see s3.listObjects

Edit 2017: Same basic idea, but listObjectsV2( ... ) is now recommended and uses a ContinuationToken (see s3.listObjectsV2):

var allKeys = [];
// Pages through s3.listObjectsV2 using ContinuationToken until the
// listing is no longer truncated, then invokes cb. cb receives the
// error (if any) as its first argument.
function listAllKeys(token, cb)
{
  var opts = { Bucket: s3bucket };
  if(token) opts.ContinuationToken = token;

  s3.listObjectsV2(opts, function(err, data){
    // Propagate failures instead of crashing on a null `data`.
    if (err) return cb(err);

    allKeys = allKeys.concat(data.Contents);

    if(data.IsTruncated)
      listAllKeys(data.NextContinuationToken, cb);
    else
      cb();
  });
}

Solution 2:

Using Async Generator

Import S3

const { S3 } = require("aws-sdk");
const s3 = new S3();

Create a generator function that retrieves the full list of files

// Async generator that yields one listObjectsV2 response (page) at a
// time, following the ContinuationToken until the listing is exhausted.
async function* listAllKeys(opts) {
  let params = { ...opts }; // never mutate the caller's object
  while (true) {
    const page = await s3.listObjectsV2(params).promise();
    yield page;
    if (!page.NextContinuationToken) break;
    params = { ...params, ContinuationToken: page.NextContinuationToken };
  }
}

Prepare the AWS parameters, based on the API docs

// Request parameters for listObjectsV2. Only Bucket is required;
// uncomment the optional fields as needed (see the S3 API reference).
const opts = {
  Bucket: "bucket-xyz" /* required */,
  // ContinuationToken: 'STRING_VALUE',
  // Delimiter: 'STRING_VALUE',
  // EncodingType: url,
  // FetchOwner: true || false,
  // MaxKeys: 'NUMBER_VALUE',
  // Prefix: 'STRING_VALUE',
  // RequestPayer: requester,
  // StartAfter: 'STRING_VALUE'
};

Use generator

async function main() {
  // Iterate the async generator; each loop turn is one page of results.
  const pages = listAllKeys(opts);
  for await (const page of pages) {
    console.log(page.Contents);
  }
}
main();

That's it.

Or Lazy Load

async function main() {
  const keys = listAllKeys(opts);
  console.log(await keys.next());
  // {value: {…}, done: false}
  console.log(await keys.next());
  // {value: {…}, done: false}
  console.log(await keys.next());
  // {value: undefined, done: true}
}
main();

Or Use generator to make Observable function

// Adapts the listAllKeys async generator to an observable-style
// subscribe function: emits each page via o$.next, signals o$.complete
// when done, and returns an unsubscribe function that stops paging.
const lister = (opts) => (o$) => {
  let needMore = true;
  const process = async () => {
    try {
      for await (const data of listAllKeys(opts)) {
        o$.next(data);
        if (!needMore) break;
      }
      o$.complete();
    } catch (err) {
      // Route async-iteration failures to the observer instead of
      // leaving an unhandled promise rejection (o$.error was never
      // called before, so subscribed error handlers could not fire).
      o$.error(err);
    }
  };
  process();
  return () => (needMore = false);
};

Use this observable function with RxJS

// Using Rxjs

const { Observable } = require("rxjs");
const { flatMap } = require("rxjs/operators");

// Subscribes to the lister and logs every object. Returns the
// Subscription so the caller can unsubscribe early.
function listAll() {
  // Observable.create is deprecated since RxJS 6; the Observable
  // constructor accepts the same subscribe function.
  return new Observable(lister(opts))
    .pipe(flatMap((v) => v.Contents))
    .subscribe(console.log);
}

listAll();

Or use this observable function with the Node.js EventEmitter

const EventEmitter = require("events");

const bucketEvents = new EventEmitter();

// Invoked once per page of results.
async function onData(data) {
  console.log(data);
}
// Invoked if the listing fails.
async function onError(error) {
  console.log(error);
}
// Invoked after the final page has been delivered.
async function onComplete() {
}

bucketEvents
  .on("next", onData)
  .on("error", onError)
  .on("complete", onComplete);

// Bridge the observer callbacks onto the emitter; `stop` cancels paging.
const stop = lister(opts)({
  next: (v) => bucketEvents.emit("next", v),
  error: (e) => bucketEvents.emit("error", e),
  complete: (v) => bucketEvents.emit("complete", v),
});

Using Typescript and AWS-SDK v3 + Deno

import {
  paginateListObjectsV2,
  S3Client,
  S3ClientConfig,
} from "@aws-sdk/client-s3";

/* // For Deno
import {
  paginateListObjectsV2,
  S3Client,
  S3ClientConfig,
} from "https://deno.land/x/[email protected]/client-s3/mod.ts"; */

// NOTE(review): credentials are hardcoded for illustration only — in
// real code load them from the environment or a credential provider.
const s3Config: S3ClientConfig = {
  credentials: {
    accessKeyId: "accessKeyId",
    secretAccessKey: "secretAccessKey",
  },
  region: "us-east-1",
};

// Shared client and listing parameters for getAllS3Files below.
const client = new S3Client(s3Config);
const s3Opts = { Bucket: "bucket-xyz" };

// Drains every page from the SDK v3 paginator into one flat array of
// object descriptors and returns it.
async function getAllS3Files() {
  const allObjects = [];
  for await (const page of paginateListObjectsV2({ client }, s3Opts)) {
    for (const obj of page.Contents ?? []) {
      allObjects.push(obj);
    }
  }
  return allObjects;
}

Solution 3:

Here's Node code I wrote to assemble the S3 objects from truncated lists.

// Listing parameters — replace the <placeholders> with real string
// values before running; as written this snippet does not parse.
var params = {
    Bucket: <yourbucket>,
    Prefix: <yourprefix>,
};

var s3DataContents = [];    // Single array of all combined S3 data.Contents

// Prints the accumulated s3DataContents: full JSON with --al, bare keys
// with --b, otherwise each key prefixed with its 1-based index.
function s3Print() {
    if (program.al) {
        // --al: dump every collected object as pretty-printed JSON
        console.log(JSON.stringify(s3DataContents, null, "    "));
    } else {
        // --b: bare keys; otherwise prefix each key with its index
        s3DataContents.forEach(function(item, idx) {
            var prefix = program.b ? '' : (idx + 1) + ': ';
            console.log(prefix + item.Key);
        });
    }
}

// Recursively pages through s3.listObjects, appending each page's
// Contents to the module-level s3DataContents, then invokes cb.
// cb receives the error (if any) as its first argument.
function s3ListObjects(params, cb) {
    s3.listObjects(params, function(err, data) {
        if (err) {
            console.log("listS3Objects Error:", err);
            // Propagate the failure — previously cb was never invoked
            // on error, so the listing silently stalled.
            cb(err);
        } else {
            var contents = data.Contents;
            s3DataContents = s3DataContents.concat(contents);
            if (data.IsTruncated) {
                // NextMarker is only returned when a Delimiter was
                // given, so set Marker to the last returned key.
                params.Marker = contents[contents.length - 1].Key;
                s3ListObjects(params, cb);
            } else {
                cb();
            }
        }
    });
}

// Kick off the paginated listing; s3Print runs after the last page.
s3ListObjects(params, s3Print);

Pay attention to listObject's documentation of NextMarker, which is NOT always present in the returned data object, so I don't use it at all in the above code ...

NextMarker — (String) When response is truncated (the IsTruncated element value in the response is true), you can use the key name in this field as marker in the subsequent request to get next set of objects. Amazon S3 lists objects in alphabetical order Note: This element is returned only if you have delimiter request parameter specified. If response does not include the NextMarker and it is truncated, you can use the value of the last Key in the response as the marker in the subsequent request to get the next set of object keys.

The entire program has now been pushed to https://github.com/kenklin/s3list.

Solution 4:

In fact, aws2js supports listing the objects in a bucket at a low level via the s3.get() method call. To do it, one has to pass the prefix parameter, which is documented on the Amazon S3 REST API page:

// Load the aws2js S3 client and point it at the target bucket.
var s3 = require('aws2js').load('s3', awsAccessKeyId, awsSecretAccessKey);
s3.setBucket(bucketName);

// Build the GET query string that filters keys by prefix.
var url = '?prefix=' + encodeURI('some/path/to/S3/folder');

// The XML response lists every matching object in the bucket.
s3.get(url, 'xml', function (error, data) {
    console.log(error);
    console.log(data);
});

The data variable in the above snippet contains a list of all objects in the bucketName bucket.

Solution 5:

Published knox-copy when I couldn't find a good existing solution. Wraps all the pagination details of the Rest API into a familiar node stream:

var knoxCopy = require('knox-copy');

// knox-copy wraps the REST API's pagination in a node stream.
var client = knoxCopy.createClient({
  key: '<api-key-here>',
  secret: '<secret-here>',
  bucket: 'mrbucket'
});

// Each 'data' event delivers one key.
var keyStream = client.streamKeys({
  // omit the prefix to list the whole bucket
  prefix: 'buckets/of/fun'
});
keyStream.on('data', function (key) {
  console.log(key);
});

If you're listing fewer than 1000 files a single page will work:

// Fetch a single page (up to 1000 keys) with an explicit callback.
var printPage = function (err, page) {
  console.log(page.Contents); // <- Here's your list of files
};
client.listPageOfKeys({ prefix: 'smaller/bucket/o/fun' }, printPage);