Since an LMDB Environment cannot be pickled, the error TypeError: can't pickle Environment objects occurs when we naively open LMDB inside data.Dataset and then wrap it in a data.DataLoader under Distributed Data Parallel (DDP): each worker process receives a pickled copy of the dataset, and an open Environment handle cannot survive that trip.
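
For reference, here is a minimal sketch (with illustrative names) of the naive pattern that triggers the error; the Environment opened in __init__ is pickled along with the dataset when worker processes are spawned:

# Naive pattern (sketch): the Environment handle created in __init__
# cannot be pickled when DataLoader/DDP spawns worker processes.
import lmdb
from torch.utils import data

class naive_dataset_LMDB(data.Dataset):
    def __init__(self, db_path):
        self.env = lmdb.open(db_path, readonly=True)  # opened too early
        self.txn = self.env.begin()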

To resolve the error, delay opening the LMDB environment until __getitem__ is first called, i.e., after the dataset has already been copied into each worker process:

# This code is modified from https://raw.githubusercontent.com/rmccorm4/PyTorch-LMDB/master

import os

import lmdb
import numpy as np
from torch.utils import data


class my_dataset_LMDB(data.Dataset):
    def __init__(self, db_path, file_paths):
        self.db_path = db_path
        self.file_paths = file_paths

        # Delay loading LMDB data until after initialization to avoid
        # the "can't pickle Environment objects" error
        self.env = None
        self.txn = None

    def _init_db(self):
        self.env = lmdb.open(self.db_path, subdir=os.path.isdir(self.db_path),
                             readonly=True, lock=False,
                             readahead=False, meminit=False)
        self.txn = self.env.begin()

    def read_lmdb(self, key):
        lmdb_data = self.txn.get(key.encode())
        # np.frombuffer defaults to float64; pass dtype= to match
        # however the data was serialized into LMDB
        lmdb_data = np.frombuffer(lmdb_data)

        return lmdb_data

    def __len__(self):
        return len(self.file_paths)

    def __getitem__(self, index):
        # Open the environment lazily, inside the worker process
        if self.env is None:
            self._init_db()

        file_name = self.file_paths[index]
        data = self.read_lmdb(file_name)
        ...
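
To put it together, here is a minimal usage sketch under DDP (db_path, file_paths, and the batch size are placeholders; it assumes the process group has already been initialized, e.g. via torchrun):

# Usage sketch: each worker gets a pickled dataset copy with env=None,
# so lmdb.open(...) runs safely inside the worker on first __getitem__.
from torch.utils import data
from torch.utils.data.distributed import DistributedSampler

dataset = my_dataset_LMDB(db_path, file_paths)
sampler = DistributedSampler(dataset)  # shards indices across DDP ranks
loader = data.DataLoader(dataset, batch_size=32, sampler=sampler,
                         num_workers=4, pin_memory=True)

for batch in loader:
    ...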

Tip. The lock=False option for lmdb.open(...) also fixes the error MDB_READERS_FULL: Environment maxreaders limit reached, since it disables LMDB's reader lock table, whose slots would otherwise be exhausted by many concurrent worker processes.


References

  1. PyTorch-LMDB, https://github.com/rmccorm4/PyTorch-LMDB
