Commit e5aca37a authored by Rémi Duraffort's avatar Rémi Duraffort Committed by Neil Williams

LAVA-1129 save job full configuration before start

This will help reproduce the exact same run.

Also add an xmlrpc api to grab the job configuration.

Change-Id: I64798a12146064d3fb811bb28bf27f49e20fd6d5
parent b708b038
......@@ -31,6 +31,14 @@ elif sys.version_info[0] == 3:
import xmlrpc.client as xmlrpclib
def load_optional_file(filename):
    """Return the utf-8 encoded content of *filename*, or None if it cannot be read."""
    try:
        with open(filename, "r") as f_in:
            content = f_in.read()
    except IOError:
        # Missing or unreadable file: the caller treats None as "not used".
        return None
    return content.encode("utf-8")
class SchedulerJobsAPI(ExposedV2API):
def cancel(self, job_id):
......@@ -55,6 +63,52 @@ class SchedulerJobsAPI(ExposedV2API):
cls = SchedulerAPI(self._context)
return cls.cancel_job(job_id)
def configuration(self, job_id):
    """
    Name
    ----
    `scheduler.jobs.configuration` (`job_id`)

    Description
    -----------
    Return the full job configuration

    Arguments
    ---------
    `job_id`: string
      Job id

    Return value
    ------------
    Return an array with [job, device, dispatcher, env, env-dut] config.
    Any of these values might be None if the corresponding file hasn't
    been used by the job.
    If the job hasn't started yet, a 404 error will be returned.
    """
    try:
        job = TestJob.get_by_job_number(job_id)
    except TestJob.DoesNotExist:
        raise xmlrpclib.Fault(
            404, "Job '%s' was not found." % job_id)

    if not job.can_view(self.user):
        raise xmlrpclib.Fault(
            403, "Job '%s' not available to user '%s'." %
            (job_id, self.user))

    # The configuration files are only written once the job has started.
    started_states = [TestJob.STATE_RUNNING,
                      TestJob.STATE_CANCELING,
                      TestJob.STATE_FINISHED]
    if job.state not in started_states:
        raise xmlrpclib.Fault(
            404, "Job '%s' has not started yet" % job_id)

    # Each entry is None when the corresponding file was not used.
    filenames = ["job.yaml", "device.yaml", "dispatcher.yaml",
                 "env.yaml", "env.dut.yaml"]
    return [load_optional_file(os.path.join(job.output_dir, name))
            for name in filenames]
def definition(self, job_id):
"""
Name
......
......@@ -280,6 +280,7 @@
{% endif %}
<li><a href="#bottom"><span class="glyphicon glyphicon-fast-forward"></span> End of log</a></li>
<li><a href="{% url 'lava.scheduler.job.log_file.plain' job.pk %}"><span class="glyphicon glyphicon-save-file"></span> Plain log</a></li>
<li><a href="{% url 'lava.scheduler.job.configuration' job.pk %}"><span class="glyphicon glyphicon-book"></span> Configuration</a></li>
<li role="separator" class="divider"></li>
<li><a href="{% url 'lava.scheduler.job.toggle_favorite' job.pk %}"><span class="glyphicon glyphicon-star{% if not is_favorite %}-empty{% endif %}"></span> {{ is_favorite|yesno:"Remove from favorites,Add to favorites" }}</a></li>
<li><a href="#" data-toggle="modal" data-target="#similar_jobs_modal"><span class="glyphicon glyphicon-search"></span> Similar jobs</a></li>
......
......@@ -13,7 +13,7 @@ from lava_scheduler_app.views import (
get_remote_definition, health_job_list, healthcheck, index,
job_annotate_failure, job_cancel, job_fail, job_change_priority, job_complete_log,
job_definition, job_definition_plain, job_description_yaml, job_detail,
job_list,
job_list, job_configuration,
job_log_file_plain, job_log_pipeline_incremental,
job_pipeline_timing, job_resubmit, job_section_log, job_status,
job_submit, job_toggle_favorite, lab_health,
......@@ -66,6 +66,8 @@ urlpatterns = [
url(r'^job/(?P<pk>[0-9]+|[0-9]+\.[0-9]+)/multinode_definition/plain$',
multinode_job_definition_plain,
name='lava.scheduler.job.multinode_definition.plain'),
url(r'^job/(?P<pk>[0-9]+|[0-9]+\.[0-9]+)/configuration$', job_configuration,
name='lava.scheduler.job.configuration'),
url(r'^job/(?P<pk>[0-9]+|[0-9]+\.[0-9]+)/log_file/plain$', job_log_file_plain,
name='lava.scheduler.job.log_file.plain'),
url(r'^job/(?P<pk>[0-9]+|[0-9]+\.[0-9]+)/timing$',
......
......@@ -4,14 +4,17 @@
from __future__ import unicode_literals
from collections import OrderedDict
import yaml
import contextlib
import datetime
import io
import jinja2
import logging
import os
import simplejson
import datetime
import re
import sys
import tarfile
import re
import yaml
from django import forms
from django.contrib.humanize.templatetags.humanize import naturaltime
......@@ -1484,6 +1487,34 @@ def job_pipeline_timing(request, pk):
content_type='text/json')
def job_configuration(request, pk):
    """Serve the job's stored configuration files as a .tar.bz2 attachment.

    The archive contains whichever of job.yaml, device.yaml,
    dispatcher.yaml, env.yaml and env.dut.yaml were saved for this job.
    If the job's output directory does not exist (job not started), an
    empty response body is returned.
    """
    def add_optional_file(tar, root, filename):
        # Best effort: any of these files may be absent for a given job.
        with contextlib.suppress(OSError):
            tar.add(os.path.join(root, filename), arcname=filename)

    job = get_restricted_job(request.user, pk, request=request)
    data = ""
    output_dir = job.output_dir
    # NOTE: the previous implementation used os.chdir(), which is
    # process-global and racy in a multi-threaded web server; tar.add's
    # arcname parameter archives relative names without changing cwd.
    if os.path.isdir(output_dir):
        fileobj = io.BytesIO()
        with tarfile.open(fileobj=fileobj, mode="w:bz2") as tar:
            for name in ["job.yaml", "device.yaml", "dispatcher.yaml",
                         "env.yaml", "env.dut.yaml"]:
                add_optional_file(tar, output_dir, name)
        data = fileobj.getvalue()

    response = HttpResponse(data, content_type="application/tar")
    # Fixed header capitalization ('content-Disposition' -> 'Content-Disposition').
    response['Content-Disposition'] = "attachment; filename=configuration.tar.bz2"
    return response
def job_log_file_plain(request, pk):
job = get_restricted_job(request.user, pk, request=request)
# Old style jobs
......
......@@ -28,6 +28,7 @@ import jinja2
import simplejson
import lzma
import os
import shutil
import sys
import time
import yaml
......@@ -410,6 +411,67 @@ class Command(LAVADaemonCommand):
# no need for the dispatcher to retain comments
return yaml.dump(job_def)
def save_job_config(self, job, worker, device_cfg, options):
    """Persist the job's full configuration into its output directory.

    Writes job.yaml and device.yaml, and copies (best effort) the env,
    env-dut and per-worker dispatcher configuration files so the exact
    run can be reproduced later.
    """
    base = job.output_dir
    mkdir(base)

    with open(os.path.join(base, "job.yaml"), "w") as handle:
        handle.write(self.export_definition(job))

    # These files are optional: silently skip any that cannot be copied.
    for key, dest in [("env", "env.yaml"), ("env_dut", "env.dut.yaml")]:
        with suppress(IOError):
            shutil.copy(options[key], os.path.join(base, dest))
    with suppress(IOError):
        src = os.path.join(options["dispatchers_config"],
                           "%s.yaml" % worker.hostname)
        shutil.copy(src, os.path.join(base, "dispatcher.yaml"))

    with open(os.path.join(base, "device.yaml"), "w") as handle:
        yaml.dump(device_cfg, handle)
def start_job(self, job, options):
    """Send the START message for *job* (and its dynamic connections) to the worker."""
    # The job context is needed to render the device configuration template.
    definition = yaml.load(job.definition)
    context = definition.get('context', {})
    target = job.actual_device
    worker = target.worker_host

    # Gather every configuration file the dispatcher will need.
    env_cfg = load_optional_yaml_file(options['env'])
    env_dut_cfg = load_optional_yaml_file(options['env_dut'])
    device_cfg = target.load_configuration(context)
    dispatcher_cfg = load_optional_yaml_file(
        os.path.join(options['dispatchers_config'],
                     "%s.yaml" % worker.hostname))

    # Keep a copy of the exact configuration used for this run.
    self.save_job_config(job, worker, device_cfg, options)

    self.logger.info("[%d] START => %s (%s)", job.id,
                     worker.hostname, target.hostname)
    send_multipart_u(self.controler,
                     [worker.hostname, 'START', str(job.id),
                      self.export_definition(job),
                      yaml.dump(device_cfg),
                      dispatcher_cfg, env_cfg, env_dut_cfg])

    # Multinode: also start this job's dynamic connections.
    for sub_job in job.sub_jobs_list:
        if sub_job == job or not sub_job.dynamic_connection:
            continue

        # A dynamic connection only needs a minimal device configuration,
        # inherited from the parent job's device.
        self.logger.info("[%d] Trimming dynamic connection device configuration.", sub_job.id)
        min_device_cfg = job.actual_device.minimise_configuration(device_cfg)

        self.save_job_config(sub_job, worker, min_device_cfg, options)

        self.logger.info("[%d] START => %s (connection)",
                         sub_job.id, worker.hostname)
        send_multipart_u(self.controler,
                         [worker.hostname, 'START',
                          str(sub_job.id),
                          self.export_definition(sub_job),
                          yaml.dump(min_device_cfg), dispatcher_cfg,
                          env_cfg, env_dut_cfg])
def start_jobs(self, options):
"""
Loop on all scheduled jobs and send the START message to the slave.
......@@ -429,49 +491,7 @@ class Command(LAVADaemonCommand):
for job in query:
msg = None
try:
# Load job definition to get the variables for template
# rendering
job_def = yaml.load(job.definition)
job_ctx = job_def.get('context', {})
device = job.actual_device
worker = device.worker_host
# Load configurations
env_str = load_optional_yaml_file(options['env'])
env_dut_str = load_optional_yaml_file(options['env_dut'])
device_cfg = device.load_configuration(job_ctx)
dispatcher_cfg_file = os.path.join(options['dispatchers_config'],
"%s.yaml" % worker.hostname)
dispatcher_cfg = load_optional_yaml_file(dispatcher_cfg_file)
self.logger.info("[%d] START => %s (%s)", job.id,
worker.hostname, device.hostname)
send_multipart_u(self.controler,
[worker.hostname, 'START', str(job.id),
self.export_definition(job),
yaml.dump(device_cfg),
dispatcher_cfg, env_str, env_dut_str])
# For multinode jobs, start the dynamic connections
parent = job
for sub_job in job.sub_jobs_list:
if sub_job == parent or not sub_job.dynamic_connection:
continue
# inherit only enough configuration for dynamic_connection operation
self.logger.info("[%d] Trimming dynamic connection device configuration.", sub_job.id)
min_device_cfg = parent.actual_device.minimise_configuration(device_cfg)
self.logger.info("[%d] START => %s (connection)",
sub_job.id, worker.hostname)
send_multipart_u(self.controler,
[worker.hostname, 'START',
str(sub_job.id),
self.export_definition(sub_job),
yaml.dump(min_device_cfg), dispatcher_cfg,
env_str, env_dut_str])
self.start_job(job, options)
except jinja2.TemplateNotFound as exc:
self.logger.error("[%d] Template not found: '%s'",
job.id, exc.message)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment