Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1

2

3

4

5

6

7

8

9

10

11

12

13

14

15

16

17

18

19

20

21

22

23

24

25

26

27

28

29

30

31

32

33

34

35

36

37

38

39

40

41

42

43

44

45

46

47

48

49

50

51

52

53

54

55

56

57

58

59

60

61

62

63

64

65

66

67

68

69

70

71

72

73

74

75

76

77

78

79

80

81

82

83

84

85

86

87

88

89

90

91

92

93

94

95

96

97

98

99

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

# coding: utf-8 

# Copyright (c) Pymatgen Development Team. 

# Distributed under the terms of the MIT License. 

 

from __future__ import division, unicode_literals 

 

""" 

This module defines the BorgQueen class, which manages drones to assimilate 

data using Python's multiprocessing. 

""" 

 

 

__author__ = "Shyue Ping Ong" 

__copyright__ = "Copyright 2012, The Materials Project" 

__version__ = "1.0" 

__maintainer__ = "Shyue Ping Ong" 

__email__ = "shyuep@gmail.com" 

__date__ = "Mar 18, 2012" 

 

 

import os 

import json 

import logging 

 

from monty.io import zopen 

from monty.json import MontyEncoder, MontyDecoder 

 

from multiprocessing import Manager, Pool 

 

logger = logging.getLogger("BorgQueen") 

 

 

class BorgQueen(object):
    """
    The Borg Queen controls the drones to assimilate data in an entire
    directory tree. Uses multiprocessing to speed up things considerably. It
    also contains convenience methods to save and load data between sessions.

    Args:
        drone (Drone): An implementation of
            :class:`pymatgen.apps.borg.hive.AbstractDrone` to use for
            assimilation.
        rootpath (str): The root directory to start assimilation. Leave it
            as None if you want to do assimilation later, or are using the
            BorgQueen to load previously assimilated data.
        number_of_drones (int): Number of drones to parallelize over.
            Typical machines today have up to four processors. Note that you
            won't see a 100% improvement with two drones over one, but you
            will definitely see a significant speedup of at least 50% or so.
            If you are running this over a server with far more processors,
            the speedup will be even greater.
    """

    def __init__(self, drone, rootpath=None, number_of_drones=1):
        self._drone = drone
        self._num_drones = number_of_drones
        self._data = []

        # Assimilate immediately only when a root path was supplied; a
        # single drone does not justify spawning worker processes.
        if rootpath:
            if number_of_drones > 1:
                self.parallel_assimilate(rootpath)
            else:
                self.serial_assimilate(rootpath)

    def _find_valid_paths(self, rootpath):
        """
        Walk rootpath and collect every path the drone reports as valid.

        Args:
            rootpath (str): Directory at which to start the walk.

        Returns:
            list: Concatenation of the drone's ``get_valid_paths`` results
            for each ``(parent, subdirs, files)`` tuple from ``os.walk``.
        """
        valid_paths = []
        for (parent, subdirs, files) in os.walk(rootpath):
            valid_paths.extend(
                self._drone.get_valid_paths((parent, subdirs, files)))
        return valid_paths

    def parallel_assimilate(self, rootpath):
        """
        Assimilate the entire subdirectory structure in rootpath, using a
        pool of ``number_of_drones`` worker processes.

        Args:
            rootpath (str): Directory at which to start assimilation.
        """
        logger.info('Scanning for valid paths...')
        valid_paths = self._find_valid_paths(rootpath)
        manager = Manager()
        # Manager-backed proxies, so workers can report results and
        # progress back to this process.
        data = manager.list()
        status = manager.dict()
        status['count'] = 0
        status['total'] = len(valid_paths)
        logger.info('{} valid paths found.'.format(len(valid_paths)))
        pool = Pool(self._num_drones)
        try:
            pool.map(order_assimilation,
                     ((path, self._drone, data, status)
                      for path in valid_paths))
        finally:
            # Always release the worker processes, even if a drone raised.
            pool.close()
            pool.join()
        # Workers append JSON strings; decode them back into objects here.
        for d in data:
            self._data.append(json.loads(d, cls=MontyDecoder))

    def serial_assimilate(self, rootpath):
        """
        Assimilate the entire subdirectory structure in rootpath serially.

        Every result returned by the drone is stored, including falsy
        ones; parallel_assimilate, by contrast, keeps only truthy results.

        Args:
            rootpath (str): Directory at which to start assimilation.
        """
        valid_paths = self._find_valid_paths(rootpath)
        total = len(valid_paths)
        for count, path in enumerate(valid_paths, 1):
            self._data.append(self._drone.assimilate(path))
            logger.info('{}/{} ({:.2f}%) done'.format(count, total,
                                                      count / total * 100))

    def get_data(self):
        """
        Returns a list of assimilated objects
        """
        return self._data

    def save_data(self, filename):
        """
        Save the assimilated data to a file.

        Args:
            filename (str): filename to save the assimilated data to. Note
                that if the filename ends with gz or bz2, the relevant gzip
                or bz2 compression will be applied.
        """
        with zopen(filename, "wt") as f:
            # json.dumps takes only the object positionally; the file handle
            # must not be passed to it (TypeError on Python 3).
            s = json.dumps(list(self._data), cls=MontyEncoder)
            # This formatting handles unicode in both Py2 and 3.
            f.write("%s" % s)

    def load_data(self, filename):
        """
        Load assimilated data from a file, replacing any data currently
        held by this BorgQueen.

        Args:
            filename (str): filename to read the assimilated data from.
        """
        with zopen(filename, "rt") as f:
            self._data = json.load(f, cls=MontyDecoder)

 

 

def order_assimilation(args):
    """
    Internal helper for BorgQueen: assimilate one path in a worker and
    report the outcome through the shared containers.

    Args:
        args (tuple): ``(path, drone, data, status)``. A truthy result of
            ``drone.assimilate(path)`` is JSON-encoded and appended to
            ``data``; ``status`` holds the 'count' and 'total' entries used
            for progress logging.
    """
    path, drone, data, status = args
    assimilated = drone.assimilate(path)
    if assimilated:
        # Stored as a JSON string; the caller decodes it afterwards.
        encoded = json.dumps(assimilated, cls=MontyEncoder)
        data.append(encoded)
    # Progress is counted whether or not the drone produced any data.
    status['count'] += 1
    done = status['count']
    overall = status['total']
    percent = done / overall * 100
    logger.info('{}/{} ({:.2f}%) done'.format(done, overall, percent))