Commit f42b6739 authored by Michael Salim's avatar Michael Salim
Browse files

Init Postgres backend works; added "balsam which"; Can run sqlite/Postgres...

Init Postgres backend works; added "balsam which"; Can run sqlite/Postgres backends from "balsam dbserver"; Upped timeouts/queue depths for ZMQ Sqlite server
parent 91279287
import argparse
from importlib.util import find_spec
import glob
import getpass
import os
import sys
import signal
......@@ -18,26 +19,46 @@ PYTHON = sys.executable
SQLITE_SERVER = find_spec('balsam.django_config.sqlite_server').origin
DB_COMMANDS = {
    # Shell command template used to launch each supported DB backend
    # (executed via run()).  {pg_db_path}-style placeholders are filled in
    # later with str.format() from ServerInfo data -- the doubled braces
    # survive the f-string and become single-brace format fields.
    'sqlite3' : f'{PYTHON} {SQLITE_SERVER}',
    # NOTE(review): two 'postgres' keys look like merge/diff residue.
    # Python keeps only the last duplicate, so the pg_ctl entry wins --
    # confirm and delete the empty one.
    'postgres': f'',
    'postgres': f'pg_ctl -D {{pg_db_path}} -w start',
    # mysql backend not implemented yet
    'mysql' : f'',
}
term_start = 0
def run(cmd):
    """Launch *cmd* through the shell with all of its output discarded.

    Returns the subprocess.Popen handle so the caller can later wait on,
    signal, or terminate the server process.
    """
    return subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.STDOUT,
    )
def stop(proc):
proc.terminate()
def stop(proc, serverinfo):
print("Balsam server shutdown...", flush=True)
try: retcode = proc.wait(timeout=30)
except subprocess.TimeoutExpired:
print("Warning: server did not quit gracefully")
proc.kill()
if serverinfo['db_type'] == 'postgres':
cmd = f'pg_ctl -D {{pg_db_path}} -w stop'.format(**serverinfo.data)
print(cmd)
proc = subprocess.Popen(cmd, shell=True)
time.sleep(2)
else:
proc.terminate()
try: retcode = proc.wait(timeout=30)
except subprocess.TimeoutExpired:
print("Warning: server did not quit gracefully")
proc.kill()
def wait(proc, serverinfo):
    """Block for roughly CHECK_PERIOD seconds while the DB server runs.

    Returns normally when the server appears to have exited; raises
    subprocess.TimeoutExpired while it is still alive (the caller's watch
    loop treats TimeoutExpired as "server OK, keep waiting").
    """
    if serverinfo['db_type'] == 'sqlite3':
        # The sqlite server is a direct child: Popen.wait raises
        # TimeoutExpired by itself while the process is still running.
        retcode = proc.wait(timeout=CHECK_PERIOD)
    elif serverinfo['db_type'] == 'postgres':
        # pg_ctl detaches the real server, so `proc` cannot be waited on;
        # poll the process table for a postgres process owned by this user.
        time.sleep(CHECK_PERIOD)
        user = getpass.getuser()
        # BUGFIX: the original string lacked the f prefix, so it grepped
        # for the literal text "{user}" instead of the real username.
        ps_proc = subprocess.Popen(f'ps aux | grep {user} | grep postgres | '
                                   'grep -v grep', shell=True,
                                   stdout=subprocess.PIPE,
                                   stderr=subprocess.STDOUT)
        stdout, _ = ps_proc.communicate()
        # BUGFIX: keep only non-empty lines.  str.split('\n') always yields
        # at least one element (''), so the original `len(lines) >= 1`
        # check was always true and a dead server was never detected.
        lines = [ln for ln in stdout.decode('utf-8').split('\n') if ln.strip()]
        if lines:
            raise subprocess.TimeoutExpired('cmd', CHECK_PERIOD)
def main(db_path):
serverinfo = ServerInfo(db_path)
......@@ -46,13 +67,14 @@ def main(db_path):
db_cmd = f"BALSAM_DB_PATH={db_path} " + DB_COMMANDS[server_type].format(**serverinfo.data)
print(f"\nStarting balsam DB server daemon for DB at {db_path}")
print(db_cmd)
proc = run(db_cmd)
# On SIGUSR1, stop immediately ("balsam server --stop" does this)
def handle_stop(signum, stack):
stop(proc)
serverinfo.update({'address': None})
stop(proc, serverinfo)
serverinfo.update({'address': None, 'host':None,'port':None})
sys.exit(0)
signal.signal(signal.SIGINT, handle_stop)
......@@ -61,17 +83,18 @@ def main(db_path):
while not term_start or time.time() - term_start < TERM_LINGER:
try:
retcode = proc.wait(timeout=CHECK_PERIOD)
wait(proc, serverinfo)
except subprocess.TimeoutExpired:
pass
else:
print("\nserver process stopped unexpectedly; restarting")
serverinfo.reset_server_address()
db_cmd = f"BALSAM_DB_PATH={db_path} " + DB_COMMANDS[server_type].format(**serverinfo.data)
print(db_cmd)
proc = run(db_cmd)
stop(proc)
serverinfo.update({'address': None})
stop(proc, serverinfo)
serverinfo.update({'address': None, 'host':None,'port':None})
if __name__ == "__main__":
input_path = sys.argv[1] if len(sys.argv) == 2 else None
......
......@@ -24,6 +24,13 @@ class ServerInfo:
' daemon did not have a clean shutdown.\n Use "balsam'
' dbserver --reset <balsam_db_directory>" to reset the server file'
)
if self.data.get('host') and os.environ.get('IS_SERVER_DAEMON')=='True':
raise RuntimeError(f"A running server address is already posted at {self.path}\n"
' Use "balsam dbserver --stop" to shut it down.\n'
' If you are sure there is no running server process, the'
' daemon did not have a clean shutdown.\n Use "balsam'
' dbserver --reset <balsam_db_directory>" to reset the server file'
)
def get_free_port_and_address(self):
hostname = socket.gethostname()
......@@ -50,13 +57,32 @@ class ServerInfo:
def get_postgres_info(self):
    """Build fresh connection info for a postgres backend.

    Returns a dict with this host's name, a newly allocated free port,
    and the path to the postgres data directory ('balsamdb') under the
    balsam DB directory.
    """
    hostname = socket.gethostname()
    port = self.get_free_port()
    # NOTE(review): the next assignment is overwritten two lines below --
    # this looks like diff residue from an older revision; confirm and
    # remove the first dict.
    info = dict(host=hostname, port=port)
    pg_db_path = os.path.join(self['balsamdb_path'], 'balsamdb')
    info = dict(host=hostname, port=port, pg_db_path=pg_db_path)
    return info
def update_sqlite3_config(self):
    """No-op: the sqlite3 backend needs no config rewrite after an address reset."""
    pass
def update_postgres_config(self):
    """Point postgresql.conf at the port chosen for this server.

    Rewrites every line starting with 'port' in the config file under
    self['pg_db_path'] to use self['port'].  The new config is written to
    a sibling .new file and renamed over the original, so a crash
    mid-write cannot truncate the live config.
    """
    conf_path = os.path.join(self['pg_db_path'], 'postgresql.conf')
    # BUGFIX: use a with-block so the descriptor is closed (the original
    # open(...).read() leaked it).
    with open(conf_path) as fp:
        # splitlines(True) keeps line endings, so unchanged lines are
        # copied verbatim and no extra blank line accumulates at EOF on
        # every rewrite (the original appended one '\n' per call).
        lines = fp.read().splitlines(True)
    new_path = f"{conf_path}.new"
    with open(new_path, 'w') as fp:
        for line in lines:
            if line.startswith('port'):
                fp.write(f"port={self['port']} # auto-set by balsam db\n")
            else:
                fp.write(line)
    os.rename(new_path, conf_path)
def reset_server_address(self):
    """Pick fresh connection info for the configured backend and apply it.

    Dispatches on self['db_type'] to the matching get_<type>_info() and
    update_<type>_config() methods, persisting the new info in between.
    """
    db_type = self['db_type']
    fresh_info = getattr(self, f'get_{db_type}_info')()
    self.update(fresh_info)
    getattr(self, f'update_{db_type}_config')()
def update(self, update_dict):
self.refresh()
......
......@@ -58,7 +58,7 @@ def configure_db_backend(db_path):
db = dict(ENGINE=ENGINES[db_type], NAME=db_name,
OPTIONS=OPTIONS[db_type], USER=user, PASSWORD=password,
HOST=host, PORT=port)
HOST=host, PORT=port, CONN_MAX_AGE=60)
DATABASES = {'default':db}
return DATABASES
......@@ -120,21 +120,21 @@ LOGGING = {
'backupCount': LOG_BACKUP_COUNT,
'formatter': 'standard',
},
'django': {
'level': LOG_HANDLER_LEVEL,
'class':'logging.handlers.RotatingFileHandler',
'filename': os.path.join(LOGGING_DIRECTORY, 'django.log'),
'maxBytes': LOG_FILE_SIZE_LIMIT,
'backupCount': LOG_BACKUP_COUNT,
'formatter': 'standard',
},
#'django': {
# 'level': LOG_HANDLER_LEVEL,
# 'class':'logging.handlers.RotatingFileHandler',
# 'filename': os.path.join(LOGGING_DIRECTORY, 'django.log'),
# 'maxBytes': LOG_FILE_SIZE_LIMIT,
# 'backupCount': LOG_BACKUP_COUNT,
# 'formatter': 'standard',
#},
},
'loggers': {
'django': {
'handlers': ['django'],
'level': 'DEBUG',
'propagate': True,
},
#'django': {
# 'handlers': ['django'],
# 'level': 'DEBUG',
# 'propagate': True,
#},
'balsam': {
'handlers': ['default'],
'level': 'DEBUG',
......
......@@ -10,7 +10,7 @@ from concurrency.exceptions import RecordModifiedError
# These are ridiculously high to benchmark
# Should be more like 5-10 sec, 3-4 retry
REQ_TIMEOUT = 60000 # 60 seconds
REQ_TIMEOUT = 300000 # 5 minutes
REQ_RETRY = 56
......@@ -23,7 +23,7 @@ class Client:
self.first_message = True
if self.serverAddr:
try:
response = self.send_request('TEST_ALIVE', timeout=3000)
response = self.send_request('TEST_ALIVE', timeout=30000)
except:
raise RuntimeError("Cannot reach server at {self.serverAddr}")
else:
......
......@@ -34,7 +34,12 @@ class ZMQServer:
self.address = self.info['address']
port = int(self.address.split(':')[2])
self.context = zmq.Context(1)
self.context = zmq.Context(4)
self.context.setsockopt(zmq.BACKLOG, 32768)
self.context.setsockopt(zmq.SNDHWM, 32768)
self.context.setsockopt(zmq.RCVHWM, 32768)
self.context.setsockopt(zmq.SNDBUF, 1000000000)
self.context.setsockopt(zmq.RCVBUF, 1000000000)
self.socket = self.context.socket(zmq.REP)
self.socket.bind(f'tcp://*:{port}')
logger.info(f"db_writer bound to socket @ {self.address}")
......
......@@ -4,7 +4,7 @@ import argparse
import sys
from balsam.scripts.cli_commands import newapp,newjob,newdep,ls,modify,rm,qsub
from balsam.scripts.cli_commands import kill,mkchild,launcher,service,make_dummies
from balsam.scripts.cli_commands import dbserver, init
from balsam.scripts.cli_commands import dbserver, init, which
def main():
parser = make_parser()
......@@ -335,6 +335,11 @@ def make_parser():
parser_dummy = subparsers.add_parser('make_dummies')
parser_dummy.add_argument('num', type=int)
parser_dummy.set_defaults(func=make_dummies)
# WHICH
# ---------
parser_which = subparsers.add_parser('which')
parser_which.set_defaults(func=which)
return parser
......
......@@ -231,11 +231,11 @@ def rm(args):
# Are we removing jobs or apps?
if objects_name.startswith('job'): cls = Job
elif objects_name.startswith('app'): cls = AppDef
objects = cls.objects.all()
objects = cls.objects
# Filter: all objects, by name-match (multiple), or by ID (unique)?
if deleteall:
deletion_objs = objects
deletion_objs = objects.all()
message = f"ALL {objects_name}"
elif name:
deletion_objs = objects.filter(name__icontains=name)
......@@ -259,10 +259,8 @@ def rm(args):
return
# Actually delete things here
for obj in deletion_objs:
msg = f"Deleted {objects_name[:-1]} {obj.cute_id}"
obj.delete()
print(msg)
deletion_objs.delete()
print("Deleted.")
def qsub(args):
......@@ -371,7 +369,7 @@ def dbserver(args):
sys.exit(0)
else:
info = serverinfo.ServerInfo(args.reset)
info.update({'address': None})
info.update({'address': None, 'host':None, 'port':None})
print("Reset done")
sys.exit(0)
......@@ -380,10 +378,10 @@ def dbserver(args):
if not server_pids:
print(f"No db_daemon processes running under {getpass.getuser()}")
else:
assert len(server_pids) == 1
pid = server_pids[0]
print(f"Stopping db_daemon {pid}")
os.kill(pid, signal.SIGUSR1)
assert len(server_pids) >= 1
for pid in server_pids:
print(f"Stopping db_daemon {pid}")
os.kill(pid, signal.SIGUSR1)
else:
path = args.path
if path: cmd = [sys.executable, fname, path]
......@@ -415,6 +413,13 @@ def init(args):
p.wait()
def which(args):
    """CLI handler for 'balsam which': pretty-print the active default DB settings."""
    import pprint
    os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
    django.setup()
    from django.conf import settings
    pprint.pprint(settings.DATABASES['default'])
def make_dummies(args):
os.environ['DJANGO_SETTINGS_MODULE'] = 'balsam.django_config.settings'
django.setup()
......
from getpass import getuser
import os
import sys
from pprint import pprint
import time
import subprocess
from balsam.django_config.serverinfo import ServerInfo
......@@ -14,12 +15,21 @@ def postgres_init(serverInfo):
p = subprocess.Popen(f'initdb -D {db_path} -U $USER', shell=True)
retcode = p.wait()
if retcode != 0: raise RuntimeError("initdb failed")
with open(os.path.join(db_path, 'postgresql.conf'), 'a') as fp:
fp.write("listen_addresses = '*' # appended from balsam init\n")
fp.write('port=0 # appended from balsam init\n')
fp.write('max_connections=128 # appended from balsam init\n')
fp.write('shared_buffers=2GB # appended from balsam init\n')
fp.write('synchronous_commit=off # appended from balsam init\n')
fp.write('wal_writer_delay=400ms # appended from balsam init\n')
with open(os.path.join(db_path, 'pg_hba.conf'), 'a') as fp:
fp.write(f"host all all 0.0.0.0/0 trust\n")
serverInfo.update({'user' : getuser()})
serverInfo.reset_server_address()
port = serverInfo['port']
with open(os.path.join(db_path, 'postgresql.conf'), 'a') as fp:
fp.write(f'port={port} # appended from balsam init\n')
serv_proc = subprocess.Popen(f'pg_ctl -D {db_path} -w start', shell=True)
time.sleep(2)
......@@ -31,8 +41,8 @@ def postgres_post(serverInfo):
db_path = serverInfo['balsamdb_path']
db_path = os.path.join(db_path, 'balsamdb')
serv_proc = subprocess.Popen(f'pg_ctl -D {db_path} -w stop', shell=True)
serv_proc.wait()
time.sleep(1)
serverInfo.update({'host':None, 'port':None})
def run_migrations():
import django
......@@ -45,16 +55,21 @@ def run_migrations():
print(f"DB settings:", settings.DATABASES['default'])
db_path = db.connection.settings_dict['NAME']
print(f"Setting up new balsam database: {db_path}")
db_info = db.connection.settings_dict['NAME']
print(f"Setting up new balsam database:")
pprint(db_info, width=60)
call_command('makemigrations', interactive=False, verbosity=0)
call_command('migrate', interactive=False, verbosity=0)
new_path = settings.DATABASES['default']['NAME']
if os.path.exists(new_path):
print(f"Set up new DB at {new_path}")
try:
from balsam.service.models import BalsamJob
j = BalsamJob()
j.save()
j.delete()
except:
raise RuntimeError("BalsamJob table not properly created")
else:
raise RuntimeError(f"Failed to created DB at {new_path}")
print("BalsamJob table created successfully")
if __name__ == "__main__":
serverInfo = ServerInfo(sys.argv[1])
......@@ -70,3 +85,4 @@ if __name__ == "__main__":
run_migrations()
if db_type == 'postgres':
postgres_post(serverInfo)
print("OK")
......@@ -12,7 +12,6 @@ class BalsamTestCase(unittest.TestCase):
@classmethod
def setUpClass(cls):
test_db_path = os.environ['BALSAM_DB_PATH']
assert test_db_path in db.connection.settings_dict['NAME']
assert 'test' in test_db_path
call_command('makemigrations',interactive=False,verbosity=0)
......@@ -29,9 +28,6 @@ class BalsamTestCase(unittest.TestCase):
pass # to be implemented by test cases
def tearDown(self):
test_db_path = os.environ['BALSAM_DB_PATH']
if not test_db_path in db.connection.settings_dict['NAME']:
raise RuntimeError("Test DB not configured")
call_command('flush',interactive=False,verbosity=0)
......
......@@ -19,7 +19,10 @@ class TestInsertion(BalsamTestCase):
self.launcherInfo = util.launcher_info()
max_workers = self.launcherInfo.num_workers
worker_counts = takewhile(lambda x: x<=max_workers, (2**i for i in range(20)))
worker_counts = list(takewhile(lambda x: x<=max_workers, (2**i for i in range(20))))
if max_workers not in worker_counts:
worker_counts.append(max_workers)
worker_counts = list(reversed(worker_counts))
#ranks_per_node = [4, 8, 16, 32]
ranks_per_node = [32]
self.experiments = product(worker_counts, ranks_per_node)
......
# BENCHMARK: test_concurrent_mpi_insert
# Host: thetamom1
# COBALT_BLOCKNAME: 2810-2813,2816,3171,3178-3179,4253-4255,4318,4408-4409,4446,4579
# COBALT_PARTNAME: 2810-2813,2816,3171,3178-3179,4253-4255,4318,4408-4409,4446,4579
# COBALT_JOBID: 181696
# COBALT_PARTSIZE: 16
# COBALT_NODEFILE: /var/tmp/cobalt.181696
# COBALT_JOBSIZE: 16
# COBALT_BLOCKSIZE: 16
# Each rank simultaneously calls dag.add_job (num_ranks simultaneous insertions)
# measure total time for entire aprun (including all aprun/python overheads)
# db_writer is running on thetalogin6, aprun from thetamom1
# num_nodes ranks_per_node num_ranks total_time_sec
# --------------------------------------------------------------
16 32 512 62.640
16 16 256 38.550
16 8 128 25.280
16 4 64 19.750
8 32 256 45.760
8 16 128 26.060
8 8 64 18.790
8 4 32 15.000
4 32 128 34.560
4 16 64 21.290
4 8 32 16.400
4 4 16 13.780
2 32 64 28.300
2 16 32 19.300
2 8 16 15.060
2 4 8 13.410
1 32 32 25.250
1 16 16 17.720
1 8 8 14.610
1 4 4 13.060
# BENCHMARK: test_concurrent_mpi_insert
# Host: thetamom1
# COBALT_PARTCORES: 64
# COBALT_ENDTIME: 1517592363
# COBALT_QUEUE: debug-cache-quad
# COBALT_PARTNAME: 3828,3835-3837
# COBALT_STARTTIME: 1517591763
# COBALT_JOBID: 182446
# COBALT_PARTSIZE: 4
# COBALT_JOBSIZE: 4
# COBALT_PROJECT: datascience
# Each rank simultaneously calls dag.add_job (num_ranks simultaneous insertions)
# num_nodes ranks_per_node num_ranks total_time_sec
# --------------------------------------------------------------
1 32 32 29.490
2 32 64 33.620
4 32 128 35.900
# BENCHMARK: test_concurrent_mpi_insert
# Host: thetamom1
# COBALT_PARTCORES: 64
# COBALT_ENDTIME: 1517870456
# COBALT_QUEUE: default
# COBALT_PARTNAME: 60-69,200-209,404,500-503,505,507-509,632-633,636-639,788-789,830-831,834-839,920-929,1030,1032,1038,1101,1103,1108,1212-1213,1216-1218,1370-1371,1373,1376,1450,1452,1454-1459,1532,1534,1570-1577,1580-1586,1588-1589,1720-1721,1724-1729,1820-1829,1920-1929,1986-1987,2090-2099,2140-2145,2184-2185,2188,2231-2235,2237-2239,2411,2416,2433,2439,2520-2525,2527-2529,2700-2703,2708-2709,2742,2745,2890-2899,2950-2951,3008-3009,3024,3069,3087,3111,3190-3194,3199,3223,3270-3272,3275,3278-3279,3310-3313,3315-3319,3330-3332,3334-3335,3361,3471-3472,3476-3479,3490,3492-3494,3496-3499,3520-3526,3577,3596,3652-3657,3669,3692-3694,3696-3697
# COBALT_STARTTIME: 1517867756
# COBALT_JOBID: 184114
# COBALT_PARTSIZE: 256
# COBALT_JOBSIZE: 256
# COBALT_PROJECT: datascience
# ENGINE: django.db.backends.sqlite3
# NAME: /home/msalim/hpc-edge-service/testdb/db.sqlite3
# OPTIONS: {'timeout': 5000}
# USER:
# PASSWORD:
# HOST: None
# PORT: None
# ATOMIC_REQUESTS: False
# AUTOCOMMIT: True
# CONN_MAX_AGE: 0
# TIME_ZONE: None
# TEST: {'CHARSET': None, 'COLLATION': None, 'NAME': None, 'MIRROR': None}
# Each rank simultaneously calls dag.add_job (num_ranks simultaneous insertions)
# num_nodes ranks_per_node num_ranks total_time_sec
# --------------------------------------------------------------
1 32 32 30.920
2 32 64 29.290
4 32 128 32.520
8 32 256 36.330
16 32 512 45.540
32 32 1024 70.520
64 32 2048 205.150
128 32 4096 308.800
# BENCHMARK: test_concurrent_mpi_insert
# Host: thetamom2
# COBALT_BLOCKNAME: 3830,3832,3836
# COBALT_PARTNAME: 3830,3832,3836
# COBALT_JOBID: 184028
# COBALT_PARTSIZE: 3
# COBALT_NODEFILE: /var/tmp/cobalt.184028
# COBALT_JOBSIZE: 3
# COBALT_BLOCKSIZE: 3
# ENGINE: django.db.backends.postgresql_psycopg2
# NAME: balsam
# OPTIONS: {}
# USER: msalim
# PASSWORD:
# HOST: thetalogin4
# PORT: 35700
# ATOMIC_REQUESTS: False
# AUTOCOMMIT: True
# CONN_MAX_AGE: 0
# TIME_ZONE: None
# TEST: {'CHARSET': None, 'COLLATION': None, 'NAME': None, 'MIRROR': None}
# Each rank simultaneously calls dag.add_job (num_ranks simultaneous insertions)
# num_nodes ranks_per_node num_ranks total_time_sec
# --------------------------------------------------------------
1 32 32 24.130
2 32 64 22.990
3 32 96 25.130
# 8 Theta compute nodes running "ranks" total mpi ranks
# (ranks-per-node is "ranks" / 8)
# mpi4py program: all ranks simultaneously insert new job thru django ORM
# measure total time for entire aprun (including all aprun/python overheads)
# db_writer is running on thetalogin6, aprun from thetamom1, COBALT_PARTNAME=3824-3827,3829,3833,3838-3839
# db_writer is a simple zeromq REP/REQ service listening on tcp://*:5556
#ranks time(seconds)
8 11.773
16 12.772
32 13.165
64 15.289
128 18.242
160 19.835
192 23.247
224 24.152
256 26.073
304 29.941
336 31.295
368 35.807
400 36.037
432 39.181
448 39.928
480 41.017
512 44.229
# same as insertion_summary BUT db_writer is running on MOM node
# 8 Theta compute nodes running "ranks" total mpi ranks
# (ranks-per-node is "ranks" / 8)
# mpi4py program: all ranks simultaneously insert new job thru django ORM
# measure total time for entire aprun (including all aprun/python overheads)
# db_writer is running on thetamom1, aprun from thetamom1, COBALT_PARTNAME=3824-3827,3829,3833,3838-3839
# db_writer is a simple zeromq REP/REQ service listening on tcp://*:5556
#ranks time(seconds)
8 11.998s
16 13.249s
32 13.901s
64 15.932s
128 20.803s
160 23.651s
192 26.026s
224 29.982s
256 31.177s
304 35.458s
336 37.431s
368 40.119s
400 42.260s
432 45.614s
448 45.678s
480 49.881s
512 52.189s
#!/bin/bash -x
#COBALT -A datascience
#COBALT -n 256
#COBALT -n 1024
#COBALT -q default
#COBALT -t 45
#COBALT -M msalim@anl.gov
......
......@@ -158,12 +158,15 @@ class FormatTable:
self.rows.append(row)
def create_header(self, title, comment):