%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% String Definitions
%
% Publisher macros bundle the publisher name with its location. They are
% referenced unquoted and may be concatenated with other strings using '#'.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

@string{PUBLISHER_ACTA          = "ACTA Press, Calgary, AB, Canada"}
@string{PUBLISHER_ACADEMY       = "Academy Publisher, Oulu, Finland"}
@string{PUBLISHER_ACM           = "ACM Press, New York, NY, USA"}
@string{PUBLISHER_CSREA         = "CSREA Press"}
@string{PUBLISHER_INDERSCIENCE  = "Inderscience Publishers, Geneve, Switzerland"}
@string{PUBLISHER_IEEE_CS       = "IEEE Computer Society, Los Alamitos, CA, USA"}
@string{PUBLISHER_SPRINGER      = "Springer Verlag, Berlin, Germany"}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Journal Publications
%
% Journal entries in this section append the publisher macro to the year field.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Journal article: IJHPCN, volume 5, number 1-2.
@article{he07unified,
  author    = "Xubin (Ben) He and Li Ou and Martha J. Kosa and Stephen L. Scott and Christian Engelmann",
  title     = "A Unified Multiple-Level Cache for High Performance Cluster Storage Systems",
  journal   = "International Journal of High Performance Computing and Networking ({IJHPCN})",
  volume    = "5",
  number    = "1-2",
  pages     = "97--109",
  year      = "2007. " # PUBLISHER_INDERSCIENCE,
  issn      = "1740-0562",
  url       = "http://www.csm.ornl.gov/~engelman/publications/he07unified.pdf",
  abstract  = "Highly available data storage for high-performance computing is becoming increasingly more critical as high-end computing systems scale up in size and storage systems are developed around network-centered architectures. A promising solution is to harness the collective storage potential of individual workstations much as we harness idle CPU cycles due to the excellent price/performance ratio and low storage usage of most commodity workstations. For such a storage system, metadata consistency is a key issue assuring storage system availability as well as data reliability.
               In this paper, we present a decentralized metadata management scheme that improves storage availability without sacrificing performance."
}

% Journal article: JCP, volume 1, number 8.
@article{engelmann06symmetric,
  author    = "Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Symmetric Active/Active High Availability for High-Performance Computing System Services",
  journal   = "Journal of Computers ({JCP})",
  volume    = "1",
  number    = "8",
  pages     = "43--54",
  year      = "2006. " # PUBLISHER_ACADEMY,
  issn      = "1796-203X",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06symmetric.pdf",
  abstract  = "This work aims to pave the way for high availability in high-performance computing (HPC) by focusing on efficient redundancy strategies for head and service nodes. These nodes represent single points of failure and control for an entire HPC system as they render it inaccessible and unmanageable in case of a failure until repair. The presented approach introduces two distinct replication methods, internal and external, for providing symmetric active/active high availability for multiple redundant head and service nodes running in virtual synchrony utilizing an existing process group communication system for service group membership management and reliable, totally ordered message delivery. Presented results of a prototype implementation that offers symmetric active/active replication for HPC job and resource management using external replication show that the highest level of availability can be provided with an acceptable performance trade-off."
}

% Journal article: ACM SIGOPS OSR, volume 40, number 2.
@article{engelmann06molar,
  author    = "Christian Engelmann and Stephen L. Scott and David E. Bernholdt and Narasimha R. Gottumukkala and Chokchai (Box) Leangsuksun and Jyothish Varma and Chao Wang and Frank Mueller and Aniruddha G.
Shet and Ponnuswamy (Saday) Sadayappan",
  title     = "{MOLAR}: {A}daptive Runtime Support for High-End Computing Operating and Runtime Systems",
  journal   = "{ACM} {SIGOPS} Operating Systems Review ({OSR})",
  volume    = "40",
  number    = "2",
  pages     = "63--72",
  year      = "2006. " # PUBLISHER_ACM,
  issn      = "0163-5980",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06molar.pdf",
  abstract  = "MOLAR is a multi-institutional research effort that concentrates on adaptive, reliable, and efficient operating and runtime system (OS/R) solutions for ultra-scale, high-end scientific computing on the next generation of supercomputers. This research addresses the challenges outlined in FAST-OS (forum to address scalable technology for runtime and operating systems) and HECRTF (high-end computing revitalization task force) activities by exploring the use of advanced monitoring and adaptation to improve application performance and predictability of system interruptions, and by advancing computer reliability, availability and serviceability (RAS) management systems to work cooperatively with the OS/R to identify and preemptively resolve system issues. This paper describes recent research of the MOLAR team in advancing RAS for high-end computing OS/Rs."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Conference Publications
%
% Conference entries carry the day range of the event inside the month field.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% SC 2008 paper, marked "To appear"; the empty fields are placeholders.
@conference{wang08proactive,
  author    = "Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott",
  title     = "Proactive Process-Level Live Migration in {HPC} Environments",
  booktitle = "Proceedings of the {IEEE/ACM} International Conference on High Performance Computing, Networking, Storage and Analysis ({SC}) 2008",
  pages     = "",
  month     = nov # "~15-21, ",
  year      = "2008",
  address   = "Austin, TX, USA",
  publisher = PUBLISHER_ACM,
  isbn      = "",
  url       = "",
  note      = "To appear",
  abstract  = ""
}

% Euro-Par / VHPC 2008 paper, marked "To appear"; the empty fields are placeholders.
@conference{tikotekar08analysis,
  author    = "Anand Tikotekar and Geoffroy Vall{\'e}e and Thomas Naughton and Hong H. Ong and Christian Engelmann and Stephen L. Scott",
  title     = "An Analysis of {HPC} Benchmark Applications in Virtual Machine Environments",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $14^{th}$ European Conference on Parallel and Distributed Computing ({Euro-Par}) 2008: $3^{rd}$ Workshop on Virtualization in High-Performance Cluster and Grid Computing ({VHPC}) 2008",
  volume    = "",
  pages     = "",
  month     = aug # "~26-29, ",
  year      = "2008",
  address   = "Las Palmas de Gran Canaria, Spain",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "",
  url       = "",
  note      = "To appear",
  abstract  = ""
}

% CCGrid / Resilience 2008 workshop paper.
@conference{engelmann08symmetric2,
  author    = "Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Symmetric Active/Active High Availability for High-Performance Computing System Services: Accomplishments and Limitations",
  booktitle = "Proceedings of the $8^{th}$ {IEEE} International Symposium on Cluster Computing and the Grid ({CCGrid}) 2008: Workshop on Resiliency in High Performance Computing ({Resilience}) 2008",
  pages     = "813--818",
  month     = may # "~19-22, ",
  year      = "2008",
  address   = "Lyon, France",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "978-0-7695-3156-4",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric2.pdf",
  abstract  = "This paper summarizes our efforts over the last 3-4 years in providing symmetric active/active high availability for high-performance computing (HPC) system services.
This work paves the way for high-level reliability, availability and serviceability in extreme-scale HPC systems by focusing on the most critical components, head and service nodes, and by reinforcing them with appropriate high availability solutions. This paper presents our accomplishments in the form of concepts and respective prototypes, discusses existing limitations, outlines possible future work, and describes the relevance of this research to other, planned efforts." } @conference{chen08online, author = "Xin Chen and Benjamin Eckart and Xubin (Ben) He and Christian Engelmann and Stephen L. Scott", title = "An Online Controller Towards Self-Adaptive File System Availability and Performance", booktitle = "Proceedings of the $5^{th}$ High Availability and Performance Workshop ({HAPCW}) 2008, in conjunction with the $1^{st}$ High-Performance Computer Science Week ({HPCSW}) 2008", month = apr # "~3-4, ", year = "2008", address = "Denver, CO, USA", url = "http://www.csm.ornl.gov/~engelman/publications/chen08online.pdf", abstract = "At the present time, it can be a significant challenge to build a large-scale distributed file system that simultaneously maintains both high availability and high performance. Although many fault tolerance technologies have been proposed and used in both commercial and academic distributed file systems to achieve high availability, most of them typically sacrifice performance for higher system availability. Additionally, recent studies show that system availability and performance are related to the system workload. In this paper, we analyze the correlations among availability, performance, and workloads based on a replication strategy, and we discuss the trade off between availability and performance with different workloads. Our analysis leads to the design of an online controller that can dynamically achieve optimal performance and availability by tuning the system replication policy." 
} @conference{tikotekar08effects, author = "Anand Tikotekar and Geoffroy Vall\'ee and Thomas Naughton and Hong H. Ong and Christian Engelmann and Stephen L. Scott and Anthony M. Filippi", title = "Effects of Virtualization on a Scientific Application -- {R}unning a Hyperspectral Radiative Transfer Code on Virtual Machines", booktitle = "Proceedings of the $2^{nd}$ Workshop on System-level Virtualization for High Performance Computing ({HPCVirt}) 2008, in conjunction with the $3^{rd}$ {ACM} {SIGOPS} European Conference on Computer Systems ({EuroSys}) 2008", month = mar # "~31, ", year = "2008", address = "Glasgow, UK", url = "http://www.csm.ornl.gov/~engelman/publications/tikotekar08effects.pdf", abstract = "The topic of system-level virtualization has recently begun to receive interest for high performance computing (HPC). This is in part due to the isolation and encapsulation offered by the virtual machine. These traits enable applications to customize their environments and maintain consistent software configurations in their virtual domains. Additionally, there are mechanisms that can be used for fault tolerance like live virtual machine migration. Given these attractive benefits to virtualization, a fundamental question arises, how does this effect my scientific application? We use this as the premise for our paper and observe a real-world scientific code running on a Xen virtual machine. We studied the effects of running a radiative transfer simulation, Hydrolight, on a virtual machine. We discuss our methodology and report observations regarding the usage of virtualization with this application." } @conference{engelmann08symmetric, author = "Christian Engelmann and Stephen L. 
Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Symmetric Active/Active Replication for Dependent Services",
  booktitle = "Proceedings of the $3^{rd}$ International Conference on Availability, Reliability and Security ({ARES}) 2008",
  pages     = "260--267",
  month     = mar # "~4-7, ",
  year      = "2008",
  address   = "Barcelona, Spain",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "978-0-7695-3102-1",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann08symmetric.pdf",
  abstract  = "During the last several years, we have established the symmetric active/active replication model for service-level high availability and implemented several proof-of-concept prototypes. One major deficiency of our model is its inability to deal with dependent services, since its original architecture is based on the client-service model. This paper extends our model to dependent services using its already existing mechanisms and features. The presented concept is based on the idea that a service may also be a client of another service, and multiple services may be clients of each other. A high-level abstraction is used to illustrate dependencies between clients and services, and to decompose dependencies between services into respective client-service dependencies. This abstraction may be used for providing high availability in distributed computing systems with complex service-oriented architectures."
}

% ARES 2008 paper; same proceedings (booktitle, dates, ISBN) as engelmann08symmetric.
@conference{vallee08framework,
  author    = "Geoffroy R. Vall{\'e}e and Kulathep Charoenpornwattana and Christian Engelmann and Anand Tikotekar and Chokchai (Box) Leangsuksun and Thomas Naughton and Stephen L. Scott",
  title     = "A Framework For Proactive Fault Tolerance",
  booktitle = "Proceedings of the $3^{rd}$ International Conference on Availability, Reliability and Security ({ARES}) 2008",
  pages     = "659--664",
  month     = mar # "~4-7, ",
  year      = "2008",
  address   = "Barcelona, Spain",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "978-0-7695-3102-1",
  url       = "http://www.csm.ornl.gov/~engelman/publications/vallee08framework.pdf",
  abstract  = "Fault tolerance is a major concern to guarantee availability of critical services as well as application execution. Traditional approaches for fault tolerance include checkpoint/restart or duplication. However it is also possible to anticipate failures and proactively take action before failures occur in order to minimize failure impact on the system and application execution. This document presents a proactive fault tolerance framework. This framework can use different proactive fault tolerance mechanisms, i.e. migration and pause/unpause. The framework also allows the implementation of new proactive fault tolerance policies thanks to a modular architecture. A first proactive fault tolerance policy has been implemented and preliminary experimentations have been done based on system-level virtualization and compared with results obtained by simulation."
}

% Euromicro PDP 2008 paper.
@conference{koenning08virtualized,
  author    = "Bj{\"o}rn K{\"o}nning and Christian Engelmann and Stephen L. Scott and George A.
(Al) Geist",
  title     = "Virtualized Environments for the {Harness} High Performance Computing Workbench",
  booktitle = "Proceedings of the $16^{th}$ {Euromicro} International Conference on Parallel, Distributed and network-based Processing ({PDP}) 2008",
  pages     = "133--140",
  month     = feb # "~13-15, ",
  year      = "2008",
  address   = "Toulouse, France",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "978-0-7695-3089-5",
  url       = "http://www.csm.ornl.gov/~engelman/publications/koenning08virtualized.pdf",
  abstract  = "This paper describes recent accomplishments in providing a virtualized environment concept and prototype for scientific application development and deployment as part of the Harness High Performance Computing (HPC) Workbench research effort. The presented work focuses on tools and mechanisms that simplify scientific application development and deployment tasks, such that only minimal adaptation is needed when moving from one HPC system to another or after HPC system upgrades. The overall technical approach focuses on the concept of adapting the HPC system environment to the actual needs of individual scientific applications instead of the traditional scheme of adapting scientific applications to individual HPC system environment properties. The presented prototype implementation is based on the mature and lightweight chroot virtualization approach for Unix-type systems with a focus on virtualized file system structure and virtualized shell environment variables utilizing virtualized environment configuration descriptions in Extensible Markup Language (XML) format. The presented work can be easily extended to other virtualization technologies, such as system-level virtualization solutions using hypervisors."
}

% Euromicro PDP 2008 paper; same proceedings as koenning08virtualized.
@conference{vallee08system,
  author    = "Geoffroy R. Vall{\'e}e and Thomas Naughton and Christian Engelmann and Hong H. Ong and Stephen L. Scott",
  title     = "System-level Virtualization for High Performance Computing",
  booktitle = "Proceedings of the $16^{th}$ {Euromicro} International Conference on Parallel, Distributed and network-based Processing ({PDP}) 2008",
  pages     = "636--643",
  month     = feb # "~13-15, ",
  year      = "2008",
  address   = "Toulouse, France",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "978-0-7695-3089-5",
  url       = "http://www.csm.ornl.gov/~engelman/publications/vallee08system.pdf",
  abstract  = "System-level virtualization has been a research topic since the 70's but regained popularity during the past few years because of the availability of efficient solution such as Xen and the implementation of hardware support in commodity processors (e.g. Intel-VT, AMD-V). However, a majority of system-level virtualization projects is guided by the server consolidation market. As a result, current virtualization solutions appear to not be suitable for high performance computing (HPC) which is typically based on large-scale systems. On another hand there is significant interest in exploiting virtual machines (VMs) within HPC for a number of other reasons. By virtualizing the machine, one is able to run a variety of operating systems and environments as needed by the applications. Virtualization allows users to isolate workloads, improving security and reliability. It is also possible to support non-native environments and/or legacy operating environments through virtualization. In addition, it is possible to balance work loads, use migration techniques to relocate applications from failing machines, and isolate fault systems for repair. This document presents the challenges for the implementation of a system-level virtualization solution for HPC. It also presents a brief survey of the different approaches and techniques to address these challenges."
}

% IASTED PDCS 2007 paper; the empty pages field is a placeholder.
@conference{ou07symmetric,
  author    = "Li Ou and Christian Engelmann and Xubin (Ben) He and Xin Chen and Stephen L.
Scott",
  title     = "Symmetric Active/Active Metadata Service for Highly Available Cluster Storage Systems",
  booktitle = "Proceedings of the $19^{th}$ {IASTED} International Conference on Parallel and Distributed Computing and Systems ({PDCS}) 2007",
  pages     = "",
  month     = nov # "~19-21, ",
  year      = "2007",
  address   = "Cambridge, MA, USA",
  publisher = PUBLISHER_ACTA,
  isbn      = "978-0-88986-703-1",
  url       = "http://www.csm.ornl.gov/~engelman/publications/ou07symmetric.pdf",
  abstract  = "In a typical distributed storage system, metadata is stored and managed by dedicated metadata servers. One way to improve the availability of distributed storage systems is to deploy multiple metadata servers. Past research focused on the active/standby model, where each active server has at least one redundant idle backup. However, interruption of service and loss of service state may occur during a fail-over depending on the used replication technique. The research in this paper targets the symmetric active/active replication model using multiple redundant service nodes running in virtual synchrony. In this model, service node failures do not cause a fail-over to a backup and there is no disruption of service or loss of service state. We propose a fast delivery protocol to reduce the latency of total order broadcast. Our prototype implementation shows that high availability of metadata servers can be achieved with an acceptable performance trade-off using the active/active metadata server solution."
}

% EuroPVM/MPI 2007 paper (LNCS volume 4757); the event spans two months,
% so the month field concatenates the sep and oct macros.
@conference{disaverio07distributed,
  author    = "Emanuele Di Saverio and Marco Cesati and Christian Di Biagio and Guido Pennella and Christian Engelmann",
  title     = "Distributed Real-Time Computing with {Harness}",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $14^{th}$ {European} {PVM/MPI} Users' Group Meeting ({EuroPVM/MPI}) 2007",
  pages     = "281--288",
  volume    = "4757",
  month     = sep # "~30 - " # oct # "~3, ",
  year      = "2007",
  address   = "Paris, France",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "978-3-540-75415-2, ISSN 0302-9743",
  url       = "http://www.csm.ornl.gov/~engelman/publications/disaverio07distributed.pdf",
  abstract  = "Modern parallel and distributed computing solutions are often built onto a ``middleware'' software layer providing a higher and common level of service between computational nodes. Harness is an adaptable, plugin-based middleware framework for parallel and distributed computing. This paper reports recent research and development results of using Harness for real-time distributed computing applications in the context of an industrial environment with the needs to perform several safety critical tasks. The presented work exploits the modular architecture of Harness in conjunction with a lightweight threaded implementation to resolve several real-time issues by adding three new Harness plug-ins to provide a prioritized lightweight execution environment, low latency communication facilities, and local timestamped event logging."
}

% IEEE ICCCN 2007 paper.
@conference{ou07fast,
  author    = "Li Ou and Xubin (Ben) He and Christian Engelmann and Stephen L.
Scott",
  title     = "A Fast Delivery Protocol for Total Order Broadcasting",
  booktitle = "Proceedings of the $16^{th}$ {IEEE} International Conference on Computer Communications and Networks ({ICCCN}) 2007",
  pages     = "730--734",
  month     = aug # "~13-16, ",
  year      = "2007",
  address   = "Honolulu, HI, USA",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "978-1-42441-251-8, ISSN 1095-2055",
  url       = "http://www.csm.ornl.gov/~engelman/publications/ou07fast.pdf",
  abstract  = "Sequencer, privilege-based, and communication history algorithms are popular approaches to implement total ordering, where communication history algorithms are most suitable for parallel computing systems, because they provide best performance under heavy work load. Unfortunately, post-transmission delay of communication history algorithms is most apparent when a system is idle. In this paper, we propose a fast delivery protocol to reduce the latency of message ordering. The protocol optimizes the total ordering process by waiting for messages only from a subset of the machines in the group, and by fast acknowledging messages on behalf of other machines. Our test results indicate that the fast delivery protocol is suitable for both idle and heavy load systems, while reducing the latency of message ordering."
}

% ACM ICS 2007 paper.
@conference{nagarajan07proactive,
  author    = "Arun B. Nagarajan and Frank Mueller and Christian Engelmann and Stephen L. Scott",
  title     = "Proactive Fault Tolerance for {HPC} with {Xen} Virtualization",
  booktitle = "Proceedings of the $21^{st}$ {ACM} International Conference on Supercomputing ({ICS}) 2007",
  pages     = "23--32",
  month     = jun # "~16-20, ",
  year      = "2007",
  address   = "Seattle, WA, USA",
  publisher = PUBLISHER_ACM,
  isbn      = "978-1-59593-768-1",
  url       = "http://www.csm.ornl.gov/~engelman/publications/nagarajan07proactive.pdf",
  abstract  = "Large-scale parallel computing is relying increasingly on clusters with thousands of processors. At such large counts of compute nodes, faults are becoming common place.
               Current techniques to tolerate faults focus on reactive schemes to recover from faults and generally rely on a checkpoint/restart mechanism. Yet, in today's systems, node failures can often be anticipated by detecting a deteriorating health status. Instead of a reactive scheme for fault tolerance (FT), we are promoting a proactive one where processes automatically migrate from ``unhealthy'' nodes to healthy ones. Our approach relies on operating system virtualization techniques exemplified by but not limited to Xen. This paper contributes an automatic and transparent mechanism for proactive FT for arbitrary MPI applications. It leverages virtualization techniques combined with health monitoring and load-based migration. We exploit Xen's live migration mechanism for a guest operating system (OS) to migrate an MPI task from a health-deteriorating node to a healthy one without stopping the MPI task during most of the migration. Our proactive FT daemon orchestrates the tasks of health monitoring, load determination and initiation of guest OS migration. Experimental results demonstrate that live migration hides migration costs and limits the overhead to only a few seconds making it an attractive approach to realize FT in HPC systems. Overall, our enhancements make proactive FT a valuable asset for long-running MPI application that is complementary to reactive FT using full checkpoint/restart schemes since checkpoint frequencies can be reduced as fewer unanticipated failures are encountered. In the context of OS virtualization, we believe that this is the first comprehensive study of proactive fault tolerance where live migration is actually triggered by health monitoring."
}

% ICCS / CCE 2007 paper (LNCS volume 4488).
@conference{engelmann07middleware,
  author    = "Christian Engelmann and Hong H. Ong and Stephen L.
Scott",
  title     = "Middleware in Modern High Performance Computing System Architectures",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $7^{th}$ International Conference on Computational Science ({ICCS}) 2007, Part II: $4^{th}$ Special Session on Collaborative and Cooperative Environments ({CCE}) 2007",
  volume    = "4488",
  pages     = "784--791",
  month     = may # "~27-30, ",
  year      = "2007",
  address   = "Beijing, China",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "3-5407-2585-5, ISSN 0302-9743",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07middleware.pdf",
  abstract  = "A recent trend in modern high performance computing (HPC) system architectures employs ``lean'' compute nodes running a lightweight operating system (OS). Certain parts of the OS as well as other system software services are moved to service nodes in order to increase performance and scalability. This paper examines the impact of this HPC system architecture trend on HPC ``middleware'' software solutions, which traditionally equip HPC systems with advanced features, such as parallel and distributed programming models, appropriate system resource management mechanisms, remote application steering and user interaction techniques. Since the approach of keeping the compute node software stack small and simple is orthogonal to the middleware concept of adding missing OS features between OS and application, the role and architecture of middleware in modern HPC systems needs to be revisited. The result is a paradigm shift in HPC middleware design, where single middleware services are moved to service nodes, while runtime environments (RTEs) continue to reside on compute nodes."
}

% CCGrid / GP2PC 2007 workshop paper.
@conference{engelmann07transparent,
  author    = "Christian Engelmann and Stephen L. Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Transparent Symmetric Active/Active Replication for Service-Level High Availability",
  booktitle = "Proceedings of the $7^{th}$ {IEEE} International Symposium on Cluster Computing and the Grid ({CCGrid}) 2007: $7^{th}$ International Workshop on Global and Peer-to-Peer Computing ({GP2PC}) 2007",
  pages     = "755--760",
  month     = may # "~14-17, ",
  year      = "2007",
  address   = "Rio de Janeiro, Brazil",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "0-7695-2833-3",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07transparent.pdf",
  abstract  = "As service-oriented architectures become more important in parallel and distributed computing systems, individual service instance reliability as well as appropriate service redundancy becomes an essential necessity in order to increase overall system availability. This paper focuses on providing redundancy strategies using service-level replication techniques. Based on previous research using symmetric active/active replication, this paper proposes a transparent symmetric active/active replication approach that allows for more reuse of code between individual service-level replication implementations by using a virtual communication layer. Service- and client-side interceptors are utilized in order to provide total transparency. Clients and servers are unaware of the replication infrastructure as it provides all necessary mechanisms internally."
}

% ARES 2007 paper.
@conference{engelmann07programming,
  author    = "Christian Engelmann and Stephen L.
Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "On Programming Models for Service-Level High Availability",
  booktitle = "Proceedings of the $2^{nd}$ International Conference on Availability, Reliability and Security ({ARES}) 2007",
  pages     = "999--1006",
  month     = apr # "~10-13, ",
  year      = "2007",
  address   = "Vienna, Austria",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "0-7695-2775-2",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann07programming.pdf",
  abstract  = "This paper provides an overview of existing programming models for service-level high availability and investigates their differences, similarities, advantages, and disadvantages. Its goal is to help to improve reuse of code and to allow adaptation to quality of service requirements by using a uniform programming model description. It further aims at encouraging a discussion about these programming models and their provided quality of service, such as availability, performance, serviceability, usability, and applicability. Within this context, the presented research focuses on providing high availability for services running on head and service nodes of high-performance computing systems."
}

% IEEE IPDPS 2007 paper.
% NOTE(review): publisher and isbn are identical to the ACM ICS 2007 entry
% (nagarajan07proactive) although the venue is an IEEE symposium -- verify
% both against the published proceedings.
@conference{wang07job,
  author    = "Chao Wang and Frank Mueller and Christian Engelmann and Stephen L. Scott",
  title     = "A Job Pause Service under {LAM/MPI+BLCR} for Transparent Fault Tolerance",
  booktitle = "Proceedings of the $21^{st}$ {IEEE} International Parallel and Distributed Processing Symposium ({IPDPS}) 2007",
  pages     = "1--10",
  month     = mar # "~26-30, ",
  year      = "2007",
  address   = "Long Beach, CA, USA",
  publisher = PUBLISHER_ACM,
  isbn      = "978-1-59593-768-1",
  url       = "http://www.csm.ornl.gov/~engelman/publications/wang07job.pdf",
  abstract  = "Checkpoint/restart (C/R) has become a requirement for long-running jobs in large-scale clusters due to a mean-time-to-failure (MTTF) in the order of hours.
               After a failure, C/R mechanisms generally require a complete restart of an MPI job from the last checkpoint. A complete restart, however, is unnecessary since all but one node are typically still alive. Furthermore, a restart may result in lengthy job requeuing even though the original job had not exceeded its time quantum. In this paper, we overcome these shortcomings. Instead of job restart, we have developed a transparent mechanism for job pause within LAM/MPI+BLCR. This mechanism allows live nodes to remain active and roll back to the last checkpoint while failed nodes are dynamically replaced by spares before resuming from the last checkpoint. Our methodology includes LAM/MPI enhancements in support of scalable group communication with fluctuating number of nodes, reuse of network connections, transparent coordinated checkpoint scheduling and a BLCR enhancement for job pause. Experiments in a cluster with the NAS Parallel Benchmark suite show that our overhead for job pause is comparable to that of a complete job restart. A minimal overhead of 5.6\% is only incurred in case migration takes place while the regular checkpoint overhead remains unchanged. Yet, our approach alleviates the need to reboot the LAM run-time environment, which accounts for considerable overhead resulting in net savings of our scheme in the experiments. Our solution further provides full transparency and automation with the additional benefit of reusing existing resources. Executing continues after failures within the scheduled job, {\em \textit{i.e.}}, the application staging overhead is not incurred again in contrast to a restart. Our scheme offers additional potential for savings through incremental checkpointing and proactive diskless live migration, which we are currently working on."
}

% HPCVirt 2007 workshop paper (no publisher, pages or ISBN recorded).
@conference{engelmann07configurable,
  author    = "Christian Engelmann and Stephen L. Scott and Hong H. Ong and Geoffroy R.
Vall\'ee and Thomas Naughton", title = "Configurable Virtualized System Environments for High Performance Computing", booktitle = "Proceedings of the $1^{st}$ Workshop on System-level Virtualization for High Performance Computing ({HPCVirt}) 2007, in conjunction with the $2^{nd}$ {ACM} {SIGOPS} European Conference on Computer Systems ({EuroSys}) 2007", month = mar # "~20, ", year = "2007", address = "Lisbon, Portugal", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07configurable.pdf", abstract = "Existing challenges for current terascale high performance computing (HPC) systems are increasingly hampering the development and deployment efforts of system software and scientific applications for next-generation petascale systems. The expected rapid system upgrade interval toward petascale scientific computing demands an incremental strategy for the development and deployment of legacy and new large-scale scientific applications that avoids excessive porting. Furthermore, system software developers as well as scientific application developers require access to large-scale testbed environments in order to test individual solutions at scale. This paper proposes to address these issues at the system software level through the development of a virtualized system environment (VSE) for scientific computing. The proposed VSE approach enables ``plug-and-play'' supercomputing through desktop-to-cluster-to-petaflop computer system-level virtualization based on recent advances in hypervisor virtualization technologies. This paper describes the VSE system architecture in detail, discusses needed tools for VSE system management and configuration, and presents respective VSE use case scenarios." } @conference{engelmann06towards, author = "Christian Engelmann and Stephen L. 
Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Towards High Availability for High-Performance Computing System Services: {A}ccomplishments and Limitations",
  booktitle = "Proceedings of the $4^{th}$ High Availability and Performance Workshop ({HAPCW}) 2006, in conjunction with the $7^{th}$ Los Alamos Computer Science Institute ({LACSI}) Symposium 2006",
  month     = oct # "~17, ",
  year      = "2006",
  address   = "Santa Fe, NM, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06towards.pdf",
  abstract  = "During the last several years, our teams at Oak Ridge National Laboratory, Louisiana Tech University, and Tennessee Technological University focused on efficient redundancy strategies for head and service nodes of high-performance computing (HPC) systems in order to pave the way for high availability (HA) in HPC. These nodes typically run critical HPC system services, like job and resource management, and represent single points of failure and control for an entire HPC system. The overarching goal of our research is to provide high-level reliability, availability, and serviceability (RAS) for HPC systems by combining HA and HPC technology. This paper summarizes our accomplishments, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment."
}

% Workshop paper: HAPCW 2006, held in conjunction with the LACSI Symposium 2006.
@conference{ou06achieving,
  author    = "Li Ou and Xin Chen and Xubin (Ben) He and Christian Engelmann and Stephen L.
Scott",
  title     = "Achieving Computational {I/O} Efficiency in a High Performance Cluster Using Multicore Processors",
  booktitle = "Proceedings of the $4^{th}$ High Availability and Performance Workshop ({HAPCW}) 2006, in conjunction with the $7^{th}$ Los Alamos Computer Science Institute ({LACSI}) Symposium 2006",
  month     = oct # "~17, ",
  year      = "2006",
  address   = "Santa Fe, NM, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/ou06achieving.pdf",
  abstract  = "Cluster computing has become one of the most popular platforms for high-performance computing today. The recent popularity of multicore processors provides a flexible way to increase the computational capability of clusters. Although the system performance may improve with multicore processors in a cluster, I/O requests initiated by multiple cores may saturate the I/O bus, and furthermore increase the latency by issuing multiple non-contiguous disk accesses. In this paper, we propose an asymmetric collective I/O for multicore processors to improve multiple non-contiguous accesses. In our configuration, one core in each multicore processor is designated as the coordinator, and others serve as computing cores. The coordinator is responsible for aggregating I/O operations from computing cores and submitting a contiguous request. The coordinator allocates contiguous memory buffers on behalf of other cores to avoid redundant data copies."
}

% Conference paper: IEEE Cluster 2006, Barcelona.
@conference{uhlemann06joshua,
  author    = "Kai Uhlemann and Christian Engelmann and Stephen L.
Scott",
  title     = "{JOSHUA}: {S}ymmetric Active/Active Replication for Highly Available {HPC} Job and Resource Management",
  booktitle = "Proceedings of the $8^{th}$ {IEEE} International Conference on Cluster Computing ({Cluster}) 2006",
  pages     = "1--10",
  month     = sep # "~25-28, ",
  year      = "2006",
  address   = "Barcelona, Spain",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "1-4244-0328-6, ISSN 1552-5244",
  url       = "http://www.csm.ornl.gov/~engelman/publications/uhlemann06joshua.pdf",
  abstract  = "Most of today's HPC systems employ a single head node for control, which represents a single point of failure as it interrupts an entire HPC system upon failure. Furthermore, it is also a single point of control as it disables an entire HPC system until repair. One of the most important HPC system service running on the head node is the job and resource management. If it goes down, all currently running jobs loose the service they report back to. They have to be restarted once the head node is up and running again. With this paper, we present a generic approach for providing symmetric active/active replication for highly available HPC job and resource management. The JOSHUA solution provides a virtually synchronous environment for continuous availability without any interruption of service and without any loss of state. Replication is performed externally via the PBS service interface without the need to modify any service code. Test results as well as availability analysis of our proof-of-concept prototype implementation show that continuous availability can be provided by JOSHUA with an acceptable performance trade-off."
}

% Conference paper: HPCC 2006, Munich (Lecture Notes in Computer Science, volume 4208).
@conference{baumann06parallel,
  author    = "Ronald Baumann and Christian Engelmann and George A.
(Al) Geist",
  title     = "A Parallel Plug-in Programming Paradigm",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $7^{th}$ International Conference on High Performance Computing and Communications ({HPCC}) 2006",
  volume    = "4208",
  pages     = "823--832",
  month     = sep # "~13-15, ",
  year      = "2006",
  address   = "Munich, Germany",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "978-3-540-39368-9, ISSN 0302-9743",
  url       = "http://www.csm.ornl.gov/~engelman/publications/baumann06parallel.pdf",
  abstract  = "Software component architectures allow assembly of applications from individual software modules based on clearly defined programming interfaces, thus improving the reuse of existing solutions and simplifying application development. Furthermore, the plug-in programming paradigm additionally enables runtime reconfigurability, making it possible to adapt to changing application needs, such as different application phases, and system properties, like resource availability, by loading/unloading appropriate software modules. Similar to parallel programs, parallel plug-ins are an abstraction for a set of cooperating individual plug-ins within a parallel application utilizing a software component architecture. Parallel programming paradigms apply to parallel plug-ins in the same way they apply to parallel programs. The research presented in this paper targets the clear definition of parallel plug-ins and the development of a parallel plug-in programming paradigm."
}

% Conference paper: ACM ICS 2006, Cairns.
@conference{varma06scalable,
  author    = "Jyothish Varma and Chao Wang and Frank Mueller and Christian Engelmann and Stephen L.
Scott",
  title     = "Scalable, Fault-Tolerant Membership for {MPI} Tasks on {HPC} Systems",
  booktitle = "Proceedings of the $20^{th}$ {ACM} International Conference on Supercomputing ({ICS}) 2006",
  pages     = "219--228",
  month     = jun # "~28-30, ",
  year      = "2006",
  address   = "Cairns, Australia",
  publisher = PUBLISHER_ACM,
  isbn      = "1-59593-282-8",
  url       = "http://www.csm.ornl.gov/~engelman/publications/varma06scalable.pdf",
  abstract  = "Reliability is increasingly becoming a challenge for high-performance computing (HPC) systems with thousands of nodes, such as IBM's Blue Gene/L. A shorter mean-time-to-failure can be addressed by adding fault tolerance to reconfigure working nodes to ensure that communication and computation can progress. However, existing approaches fall short in providing scalability and small reconfiguration overhead within the fault-tolerant layer. This paper contributes a scalable approach to reconfigure the communication infrastructure after node failures. We propose a decentralized (peer-to-peer) protocol that maintains a consistent view of active nodes in the presence of faults. Our protocol shows response times in the order of hundreds of microseconds and single-digit milliseconds for reconfiguration using MPI over Blue Gene/L and TCP over Gigabit, respectively. The protocol can be adapted to match the network topology to further increase performance. We also verify experimental results against a performance model, which demonstrates the scalability of the approach. Hence, the membership service is suitable for deployment in the communication layer of MPI runtime systems, and we have integrated an early version into LAM/MPI."
}

% Conference paper: International Conference on Computer Science and Information Systems 2006, Athens.
@conference{okunbor06exploring,
  author    = "Daniel I. Okunbor and Christian Engelmann and Stephen L.
Scott",
  title     = "Exploring Process Groups for Reliability, Availability and Serviceability of Terascale Computing Systems",
  booktitle = "Proceedings of the $2^{nd}$ International Conference on Computer Science and Information Systems 2006",
  month     = jun # "~19-21, ",
  year      = "2006",
  address   = "Athens, Greece",
  url       = "http://www.csm.ornl.gov/~engelman/publications/okunbor06exploring.pdf",
  abstract  = "This paper presents various aspects of reliability, availability and serviceability (RAS) systems as they relate to group communication service, including reliable and total order multicast/broadcast, virtual synchrony, and failure detection. While the issue of availability, particularly high availability using replication-based architectures has recently received upsurge research interests, much still have to be done in understanding the basic underlying concepts for achieving RAS systems, especially in high-end and high performance computing (HPC) communities. Various attributes of group communication service and the prototype of symmetric active replication following ideas utilized in the Newtop protocol will be discussed. We explore the application of group communication service for RAS HPC, laying the groundwork for its integrated model."
}

% Conference paper: ICCS 2006, Part II, CCE special session (Lecture Notes in Computer Science, volume 3992).
@conference{engelmann06rmix,
  author    = "Christian Engelmann and George A.
(Al) Geist",
  title     = "{RMIX}: {A} Dynamic, Heterogeneous, Reconfigurable Communication Framework",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $6^{th}$ International Conference on Computational Science ({ICCS}) 2006, Part II: $3^{rd}$ Special Session on Collaborative and Cooperative Environments ({CCE}) 2006",
  volume    = "3992",
  pages     = "573--580",
  month     = may # "~28-31, ",
  year      = "2006",
  address   = "Reading, UK",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "3-540-34381-4, ISSN 0302-9743",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06rmix.pdf",
  abstract  = "RMIX is a dynamic, heterogeneous, reconfigurable communication framework that allows software components to communicate using various RMI/RPC protocols, such as ONC RPC, Java RMI and SOAP, by facilitating dynamically loadable provider plug-ins to supply different protocol stacks. With this paper, we present a native (C-based), flexible, adaptable, multi-protocol RMI/RPC communication framework that complements the Java-based RMIX variant previously developed by our partner team at Emory University. Our approach offers the same multi-protocol RMI/RPC services and advanced invocation semantics via a C-based interface that does not require an object-oriented programming language. This paper provides a detailed description of our RMIX framework architecture and some of its features. It describes the general use case of the RMIX framework and its integration into the Harness metacomputing environment in the form of a plug-in."
}

% Workshop paper: FARES 2006, part of ARES 2006, Vienna.
@conference{engelmann06active,
  author    = "Christian Engelmann and Stephen L.
Scott and Chokchai (Box) Leangsuksun and Xubin (Ben) He",
  title     = "Active/Active Replication for Highly Available {HPC} System Services",
  booktitle = "Proceedings of the $1^{st}$ International Conference on Availability, Reliability and Security ({ARES}) 2006: $1^{st}$ International Workshop on Frontiers in Availability, Reliability and Security ({FARES}) 2006",
  pages     = "639--645",
  month     = apr # "~20-22, ",
  year      = "2006",
  address   = "Vienna, Austria",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "0-7695-2567-9",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann06active.pdf",
  abstract  = "Today's high performance computing systems have several reliability deficiencies resulting in availability and serviceability issues. Head and service nodes represent a single point of failure and control for an entire system as they render it inaccessible and unmanageable in case of a failure until repair, causing a significant downtime. This paper introduces two distinct replication methods (internal and external) for providing symmetric active/active high availability for multiple head and service nodes running in virtual synchrony. It presents a comparison of both methods in terms of expected correctness, ease-of-use and performance based on early results from ongoing work in providing symmetric active/active high availability for two HPC system services (TORQUE and PVFS metadata server). It continues with a short description of a distributed mutual exclusion algorithm and a brief statement regarding the handling of Byzantine failures. This paper concludes with an overview of past and ongoing work, and a short summary of the presented research."
}

% Workshop paper: HAPCW 2005, held in conjunction with the LACSI Symposium 2005.
@conference{engelmann05concepts,
  author    = "Christian Engelmann and Stephen L.
Scott",
  title     = "Concepts for High Availability in Scientific High-End Computing",
  booktitle = "Proceedings of the $3^{rd}$ High Availability and Performance Workshop ({HAPCW}) 2005, in conjunction with the $6^{th}$ Los Alamos Computer Science Institute ({LACSI}) Symposium 2005",
  month     = oct # "~11, ",
  year      = "2005",
  address   = "Santa Fe, NM, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann05concepts.pdf",
  abstract  = "Scientific high-end computing (HEC) has become an important tool for scientists world-wide to understand problems, such as in nuclear fusion, human genomics and nanotechnology. Every year, new HEC systems emerge on the market with better performance and higher scale. With only very few exceptions, the overall availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically due to the recent trend towards capability computing. In this paper, we analyze the existing deficiencies of current HEC systems and present several high availability concepts to counter the experienced loss of availability and to alleviate the expected impact on next-generation systems. We explain the application of these concepts to current and future HEC systems and list past and ongoing related research. This paper closes with a short summary of the presented work and a brief discussion of future efforts."
}

% Conference paper: IEEE Cluster 2005, Boston.
@conference{limaye05jobsite,
  author    = "Kshitij Limaye and Chokchai (Box) Leangsuksun and Zeno Greenwood and Stephen L. Scott and Christian Engelmann and Richard M.
Libby and Kasidit Chanchio",
  title     = "Job-Site Level Fault Tolerance for Cluster and {Grid} Environments",
  booktitle = "Proceedings of the $7^{th}$ {IEEE} International Conference on Cluster Computing ({Cluster}) 2005",
  pages     = "1--9",
  month     = sep # "~26-30, ",
  year      = "2005",
  address   = "Boston, MA, USA",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "0-7803-9486-0, ISSN 1552-5244",
  url       = "http://www.csm.ornl.gov/~engelman/publications/limaye05job-site.pdf",
  abstract  = "In order to adopt high performance clusters and grid computing for mission critical applications, fault tolerance is a necessity. Common fault tolerance techniques in distributed systems are normally achieved with checkpoint-recovery and job replication on alternative resources, in cases of a system outage. The first approach depends on the system's MTTR while the latter approach depends on the availability of alternative sites to run replicas. There is a need for complementing these approaches by proactively handling failures at a job-site level, ensuring the system high availability with no loss of user submitted jobs. This paper discusses a novel fault tolerance technique that enables the job-site recovery in Beowulf cluster-based grid environments, whereas existing techniques give up a failed system by seeking alternative resources. Our results suggest sizable aggregate performance improvement during an implementation of our method in Globus-enabled HA-OSCAR. The technique called ``Smart Failover'' provides a transparent and graceful recovery mechanism that saves job states in a local job-manager queue and transfers those states to the backup server periodically, and in critical system events. Thus whenever a failover occurs, the backup server is able to restart the jobs from their last saved state."
}

% Conference paper: SERP 2005, Las Vegas.
@conference{song05umlbased,
  author    = "Hertong Song and Chokchai (Box) Leangsuksun and Raja Nassar and Yudan Liu and Christian Engelmann and Stephen L.
Scott",
  title     = "{UML-based} {Beowulf} Cluster Availability Modeling",
  booktitle = "International Conference on Software Engineering Research and Practice ({SERP}) 2005",
  pages     = "161--167",
  month     = jun # "~27-30, ",
  year      = "2005",
  address   = "Las Vegas, NV, USA",
  publisher = PUBLISHER_CSREA,
  isbn      = "1-932415-49-1"
}

% Workshop paper: COSET-2 2005, held in conjunction with ACM ICS 2005, Cambridge (MA).
@conference{engelmann05high,
  author    = "Christian Engelmann and Stephen L. Scott",
  title     = "High Availability for Ultra-Scale High-End Scientific Computing",
  booktitle = "Proceedings of the $2^{nd}$ International Workshop on Operating Systems, Programming Environments and Management Tools for High-Performance Computing on Clusters ({COSET-2}) 2005, in conjunction with the $19^{th}$ {ACM} International Conference on Supercomputing ({ICS}) 2005",
  month     = jun # "~19, ",
  year      = "2005",
  address   = "Cambridge, MA, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high.pdf",
  abstract  = "Ultra-scale architectures for scientific high-end computing with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, suffer from availability deficiencies, which impact the efficiency of running computational jobs by forcing frequent checkpointing of applications. Most systems are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services, such as the job scheduler or MPI, or even of the entire machine. In this paper, we present a flexible, pluggable and component-based high availability framework that expands today's effort in high availability computing of keeping a single server alive to include all machines cooperating in a high-end scientific computing environment, while allowing adaptation to system properties and application needs."
}

% Workshop paper: COSET-2 2005, held in conjunction with ACM ICS 2005, Cambridge (MA).
@conference{leangsuksun05asymmetric,
  author    = "Chokchai (Box) Leangsuksun and Venkata K. Munganuru and Tong Liu and Stephen L.
Scott and Christian Engelmann",
  title     = "Asymmetric Active-Active High Availability for High-end Computing",
  booktitle = "Proceedings of the $2^{nd}$ International Workshop on Operating Systems, Programming Environments and Management Tools for High-Performance Computing on Clusters ({COSET-2}) 2005, in conjunction with the $19^{th}$ {ACM} International Conference on Supercomputing ({ICS}) 2005",
  month     = jun # "~19, ",
  year      = "2005",
  address   = "Cambridge, MA, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/leangsuksun05asymmetric.pdf",
  abstract  = "Linux clusters have become very popular for scientific computing at research institutions world-wide, because they can be easily deployed at a fairly low cost. However, the most pressing issues of today's cluster solutions are availability and serviceability. The conventional Beowulf cluster architecture has a single head node connected to a group of compute nodes. This head node is a typical single point of failure and control, which severely limits availability and serviceability by effectively cutting off healthy compute nodes from the outside world upon overload or failure. In this paper, we describe a paradigm that addresses this issue using asymmetric active-active high availability. Our framework comprises of n + 1 head nodes, where n head nodes are active in the sense that they provide services to simultaneously incoming user requests. One standby server monitors all active servers and performs a fail-over in case of a detected outage. We present a prototype implementation based on a 2 + 1 solution and discuss initial results."
}

% Conference paper: ICCS 2005, Part I, Atlanta (Lecture Notes in Computer Science, volume 3514).
@conference{engelmann05superscalable,
  author    = "Christian Engelmann and George A.
(Al) Geist",
  title     = "Super-Scalable Algorithms for Computing on 100,000 Processors",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $5^{th}$ International Conference on Computational Science ({ICCS}) 2005, Part I",
  volume    = "3514",
  pages     = "313--320",
  month     = may # "~22-25, ",
  year      = "2005",
  address   = "Atlanta, GA, USA",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "978-3-540-26032-5, ISSN 0302-9743",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann05superscalable.pdf",
  abstract  = "In the next five years, the number of processors in high-end systems for scientific computing is expected to rise to tens and even hundreds of thousands. For example, the IBM Blue Gene/L can have up to 128,000 processors and the delivery of the first system is scheduled for 2005. Existing deficiencies in scalability and fault-tolerance of scientific applications need to be addressed soon. If the number of processors grows by a magnitude and efficiency drops by a magnitude, the overall effective computing performance stays the same. Furthermore, the mean time to interrupt of high-end computer systems decreases with scale and complexity. In a 100,000-processor system, failures may occur every couple of minutes and traditional checkpointing may no longer be feasible. With this paper, we summarize our recent research in super-scalable algorithms for computing on 100,000 processors. We introduce the algorithm properties of scale invariance and natural fault tolerance, and discuss how they can be applied to two different classes of algorithms. We also describe a super-scalable diskless checkpointing algorithm for problems that can't be transformed into a super-scalable variant, or where other solutions are more efficient. Finally, a 100,000-processor simulator is presented as a platform for testing and experimentation."
}

% Workshop paper: HCW 2005, part of IEEE IPDPS 2005, Denver.
@conference{engelmann05lightweight,
  author    = "Christian Engelmann and George A.
(Al) Geist",
  title     = "A Lightweight Kernel for the Harness Metacomputing Framework",
  booktitle = "Proceedings of the $19^{th}$ {IEEE} International Parallel and Distributed Processing Symposium ({IPDPS}) 2005: $14^{th}$ Heterogeneous Computing Workshop ({HCW}) 2005",
  month     = apr # "~4, ",
  year      = "2005",
  address   = "Denver, CO, USA",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "0-7695-2312-9, ISSN 1530-2075",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann05lightweight.pdf",
  abstract  = "Harness is a pluggable heterogeneous Distributed Virtual Machine (DVM) environment for parallel and distributed scientific computing. This paper describes recent improvements in the Harness kernel design. By using a lightweight approach and moving previously integrated system services into software modules, the software becomes more versatile and adaptable. This paper outlines these changes and explains the major Harness kernel components in more detail. A short overview is given of ongoing efforts in integrating RMIX, a dynamic heterogeneous reconfigurable communication framework, into the Harness environment as a new plug-in software module. We describe the overall impact of these changes and how they relate to other ongoing work."
}

% Workshop paper: HAPCW 2004, held in conjunction with the LACSI Symposium 2004.
@conference{engelmann04high,
  author    = "Christian Engelmann and Stephen L. Scott and George A. (Al) Geist",
  title     = "High Availability through Distributed Control",
  booktitle = "Proceedings of the $2^{nd}$ High Availability and Performance Workshop ({HAPCW}) 2004, in conjunction with the $5^{th}$ Los Alamos Computer Science Institute ({LACSI}) Symposium 2004",
  month     = oct # "~12, ",
  year      = "2004",
  address   = "Santa Fe, NM, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann04high.pdf",
  abstract  = "Cost-effective, flexible and efficient scientific simulations in cutting-edge research areas utilize huge high-end computing resources with thousands of processors.
In the next five to ten years the number of processors in such computer systems will rise to tens of thousands, while scientific application running times are expected to increase further beyond the Mean-Time-To-Interrupt (MTTI) of hardware and system software components. This paper describes the ongoing research in heterogeneous adaptable reconfigurable networked systems (Harness) and its recent achievements in the area of high availability distributed virtual machine environments for parallel and distributed scientific computing. It shows how a distributed control algorithm is able to steer a distributed virtual machine process in virtual synchrony while maintaining consistent replication for high availability. It briefly illustrates ongoing work in heterogeneous reconfigurable communication frameworks and security mechanisms. The paper continues with a short overview of similar research in reliable group communication frameworks, fault-tolerant process groups and highly available distributed virtual processes. It closes with a brief discussion of possible future research directions."
}

% Workshop paper: HAPCW 2004, held in conjunction with the LACSI Symposium 2004.
@conference{he04highly,
  author    = "Xubin (Ben) He and Li Ou and Stephen L. Scott and Christian Engelmann",
  title     = "A Highly Available Cluster Storage System using Scavenging",
  booktitle = "Proceedings of the $2^{nd}$ High Availability and Performance Workshop ({HAPCW}) 2004, in conjunction with the $5^{th}$ Los Alamos Computer Science Institute ({LACSI}) Symposium 2004",
  month     = oct # "~12, ",
  year      = "2004",
  address   = "Santa Fe, NM, USA",
  url       = "http://www.csm.ornl.gov/~engelman/publications/he04highly.pdf",
  abstract  = "Highly available data storage for high-performance computing is becoming increasingly more critical as high-end computing systems scale up in size and storage systems are developed around network-centered architectures.
A promising solution is to harness the collective storage potential of individual workstations much as we harness idle CPU cycles due to the excellent price/performance ratio and low storage usage of most commodity workstations. For such a storage system, metadata consistency is a key issue assuring storage system availability as well as data reliability. In this paper, we present a decentralized metadata management scheme that improves storage availability without sacrificing performance."
}

% Workshop paper: CLADE 2003, held in conjunction with IEEE HPDC 2003, Seattle.
@conference{engelmann03diskless,
  author    = "Christian Engelmann and George A. (Al) Geist",
  title     = "A Diskless Checkpointing Algorithm for Super-scale Architectures Applied to the Fast Fourier Transform",
  booktitle = "Proceedings of the Challenges of Large Applications in Distributed Environments Workshop ({CLADE}) 2003, in conjunction with the $12^{th}$ {IEEE} International Symposium on High Performance Distributed Computing ({HPDC}) 2003",
  pages     = "47",
  month     = jun # "~21, ",
  year      = "2003",
  address   = "Seattle, WA, USA",
  publisher = PUBLISHER_IEEE_CS,
  isbn      = "0-7695-1984-9",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann03diskless.pdf",
  abstract  = "This paper discusses the issue of fault-tolerance in distributed computer systems with tens or hundreds of thousands of diskless processor units. Such systems, like the IBM Blue Gene/L, are predicted to be deployed in the next five to ten years. Since a 100,000-processor system is going to be less reliable, scientific applications need to be able to recover from occurring failures more efficiently. In this paper, we adapt the present technique of diskless checkpointing to such huge distributed systems in order to equip existing scientific algorithms with super-scalable fault-tolerance.
First, we discuss the method of diskless checkpointing, then we adapt this technique to super-scale architectures and finally we present results from an implementation of the Fast Fourier Transform that uses the adapted technique to achieve super-scale fault-tolerance."
}

% Conference paper: ICCS 2002, Part II, Amsterdam (Lecture Notes in Computer Science, volume 2330).
@conference{engelmann02distributed,
  author    = "Christian Engelmann and Stephen L. Scott and George A. (Al) Geist",
  title     = "Distributed Peer-to-Peer Control in {Harness}",
  booktitle = "Lecture Notes in Computer Science: Proceedings of the $2^{nd}$ International Conference on Computational Science ({ICCS}) 2002, Part II: Workshop on Global and Collaborative Computing",
  volume    = "2330",
  pages     = "720--727",
  month     = apr # "~21-24, ",
  year      = "2002",
  address   = "Amsterdam, The Netherlands",
  publisher = PUBLISHER_SPRINGER,
  isbn      = "3-540-43593-X, ISSN 0302-9743",
  url       = "http://www.csm.ornl.gov/~engelman/publications/engelmann02distributed.pdf",
  abstract  = "Harness is an adaptable fault-tolerant virtual machine environment for next-generation heterogeneous distributed computing developed as a follow on to PVM. It additionally enables the assembly of applications from plug-ins and provides fault-tolerance. This work describes the distributed control, which manages global state replication to ensure a high-availability of service. Group communication services achieve an agreement on an initial global state and a linear history of global state changes at all members of the distributed virtual machine. This global state is replicated to all members to easily recover from single, multiple and cascaded faults. A peer-to-peer ring network architecture and tunable multi-point failure conditions provide heterogeneity and scalability. Finally, the integration of the distributed control into the multi-threaded kernel architecture of Harness offers a fault-tolerant global state database service for plug-ins and applications."
}

%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Conference Poster Presentations
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

% Poster: HPCSW 2008, Denver.
@misc{geist08harness,
  author       = "George A. (Al) Geist and Christian Engelmann and Jack J. Dongarra and George Bosilca and Magdalena S\l{}awi\'nska and Jaros\l{}aw S\l{}awi\'nski",
  title        = "The {Harness} Workbench: {U}nified and Adaptive Access to Diverse High-Performance Computing Platforms",
  month        = mar # "~30 - " # apr # "~5, ",
  year         = "2008",
  howpublished = "{Poster at the $1^{st}$ High-Performance Computer Science Week ({HPCSW}) 2008, Denver, CO, USA}",
  url          = "http://www.csm.ornl.gov/~engelman/publications/geist08harness.pdf",
  abstract     = "This poster summarizes our past and ongoing research and development efforts in novel software solutions for providing unified and adaptive access to diverse high-performance computing (HPC) platforms. The poster showcases developed proof-of-concept implementations of tools and mechanisms that simplify scientific application development and deployment tasks, such that only minimal adaptation is needed when moving from one HPC system to another or after HPC system upgrades."
}

% Poster: HPCSW 2008, Denver.
@misc{scott08resiliency,
  author       = "Stephen L. Scott and Christian Engelmann and Hong H. Ong and Geoffroy R.
Vall\'ee and Thomas Naughton and Anand Tikotekar and Chokchai (Box) Leangsuksun and Nichamon Naksinehaboon and Raja Nassar and Mihaela Paun and Xubin (Ben) He and Li Ou and Xin Chen",
  title        = "Resiliency for High-Performance Computing Systems",
  month        = mar # "~30 - " # apr # "~5, ",
  year         = "2008",
  howpublished = "{Poster at the $1^{st}$ High-Performance Computer Science Week ({HPCSW}) 2008, Denver, CO, USA}",
  url          = "http://www.csm.ornl.gov/~engelman/publications/scott08resiliency.pdf",
  abstract     = "This poster summarizes our past and ongoing research and development efforts in novel system software solutions for providing high-level reliability, availability and serviceability (RAS) for next-generation extreme-scale high-performance computing (HPC) systems and beyond. The poster showcases results of developed proof-of-concept implementations and performed theoretical analyses, outlines planned research and development activities, and presents respective initial results."
}

% Poster: HPCSW 2008, Denver.
@misc{scott08systemlevel,
  author       = "Stephen L. Scott and Geoffroy R. Vall\'ee and Thomas Naughton and Anand Tikotekar and Christian Engelmann and Hong H. Ong",
  title        = "System-level Virtualization for High-Performance Computing",
  month        = mar # "~30 - " # apr # "~5, ",
  year         = "2008",
  howpublished = "{Poster at the $1^{st}$ High-Performance Computer Science Week ({HPCSW}) 2008, Denver, CO, USA}",
  url          = "http://www.csm.ornl.gov/~engelman/publications/scott08systemlevel.pdf",
  abstract     = "This poster summarizes our past and ongoing research and development efforts in novel system software solutions for providing a virtual system environment (VSE) for next-generation extreme-scale high-performance computing (HPC) systems and beyond. The poster showcases results of developed proof-of-concept implementations and performed theoretical analyses, outlines planned research and development activities, and presents respective initial results."
} %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Invited Talks and Lectures %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @misc{engelmann08resiliency, author = "Christian Engelmann", title = "Resiliency for High-Performance Computing", month = apr # "~10-12, ", year = "2008", howpublished = "{Invited talk at the $2^{nd}$ Collaborative and Grid Computing Technologies Workshop ({CGCTW}) 2008, Cancun, Mexico}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08resiliency.ppt.pdf", abstract = "The continuing growth in high-performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our past accomplishments, ongoing work, and future plans in the area of high-level RAS for HPC." 
} @misc{engelmann08advanced, author = "Christian Engelmann", title = "Advanced Fault Tolerance Solutions for High Performance Computing", month = feb # "~11, ", year = "2008", howpublished = "{Seminar at the Laboratoire d'Analyse et d'Architecture des Syst\`{e}mes, Centre National de la Recherche Scientifique, Toulouse, France}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann08advanced.ppt.pdf", abstract = "The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment." 
} @misc{engelmann07service, author = "Christian Engelmann", title = "Service-Level High Availability in Parallel and Distributed Systems", month = oct # "~10, ", year = "2007", howpublished = "{Seminar at the Department of Computer Science, University of Reading, Reading, United Kingdom}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07service.pdf", abstract = "As service-oriented architectures become more important in parallel and distributed computing systems, individual service instance reliability as well as appropriate service redundancy are essential to increase overall system availability. This talk focuses on redundancy strategies using service-level replication techniques. An overview of existing programming models for service-level high availability is presented and their differences, similarities, advantages, and disadvantages are discussed. Recent advances in providing service-level symmetric active/active high availability are discussed. While the primary target of the presented research is high availability for service nodes in tightly-coupled extreme-scale high-performance computing (HPC) systems, it is also applicable to loosely-coupled distributed computing scenarios." } @misc{engelmann07advanced2, author = "Christian Engelmann", title = "Advanced Fault Tolerance Solutions for High Performance Computing", month = jun # "~8, ", year = "2007", howpublished = "{Invited talk at the Workshop on Trends, Technologies and Collaborative Opportunities in High Performance and Grid Computing, Khon Kaen, Thailand}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07advanced2.ppt.pdf", abstract = "The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS).
With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment." } @misc{engelmann07advanced, author = "Christian Engelmann", title = "Advanced Fault Tolerance Solutions for High Performance Computing", month = jun # "~4-5, ", year = "2007", howpublished = "{Invited talk at the Workshop on Trends, Technologies and Collaborative Opportunities in High Performance and Grid Computing, Bangkok, Thailand}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07advanced.ppt.pdf", abstract = "The continuing growth in high performance computing (HPC) system scale poses a challenge for system software and scientific applications with respect to reliability, availability and serviceability (RAS). 
With only very few exceptions, the availability of recently installed systems has been lower in comparison to the same deployment phase of their predecessors. As a result, sites lower allowable job run times in order to force applications to store intermediate results (checkpoints) as insurance against lost computation time. However, checkpoints themselves waste valuable computation time and resources. In contrast to the experienced loss of availability, the demand for continuous availability has risen dramatically with the trend towards capability computing, which drives the race for scientific discovery by running applications on the fastest machines available while desiring significant amounts of time (weeks and months) without interruption. These machines must be able to run in the event of frequent interrupts in such a manner that the capability is not severely degraded. Thus, research and development of scalable RAS technologies is paramount to the success of future extreme-scale systems. This talk summarizes our accomplishments in the area of high-level RAS for HPC, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment." } @misc{engelmann07operating, author = "Christian Engelmann", title = "Operating System Research at {ORNL}: {S}ystem-level Virtualization", month = apr # "~10, ", year = "2007", howpublished = "{Seminar at the Institute of Graphics and Parallel Processing, Johannes Kepler University, Linz, Austria}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07operating.ppt.pdf", abstract = "The emergence of virtualization enabled hardware, such as the latest generation AMD and Intel processors, has raised significant interest in the High Performance Computing (HPC) community.
In particular, system-level virtualization provides an opportunity to advance the design and development of operating systems, programming environments, administration practices, and resource management tools. This leads to some potential research topics for HPC, such as failure tolerance, system management, and solutions for application porting to new HPC platforms. This talk will present an overview of the research in System-level Virtualization taking place by the Systems Research Team in the Computer Science Research Group at Oak Ridge National Laboratory." } @misc{engelmann07towards, author = "Christian Engelmann", title = "Towards High Availability for High-Performance Computing System Services: {A}ccomplishments and Limitations", month = mar # "~14, ", year = "2007", howpublished = "{Seminar at the Department of Computer Science, University of Reading, Reading, United Kingdom}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann07towards.pdf", abstract = "During the last several years, our teams at Oak Ridge National Laboratory, Louisiana Tech University, and Tennessee Technological University focused on efficient redundancy strategies for head and service nodes of high-performance computing (HPC) systems in order to pave the way for high availability (HA) in HPC. These nodes typically run critical HPC system services, like job and resource management, and represent single points of failure and control for an entire HPC system. The overarching goal of our research is to provide high-level reliability, availability, and serviceability (RAS) for HPC systems by combining HA and HPC technology. This talk summarizes our accomplishments, such as developed concepts and implemented proof-of-concept prototypes, and describes existing limitations, such as performance issues, which need to be dealt with for production-type deployment." 
} @misc{engelmann06high, author = "Christian Engelmann", title = "High Availability for Ultra-Scale High-End Scientific Computing", month = jun # "~9, ", year = "2006", howpublished = "{Seminar at the Department of Computer Science, University of Reading, Reading, United Kingdom}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann06high.ppt.pdf", abstract = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in high availability solutions for ultra-scale scientific high-end computing." } @misc{scott06advancing, author = "Stephen L. 
Scott and Christian Engelmann", title = "Advancing Reliability, Availability and Serviceability for High-Performance Computing", month = apr # "~19, ", year = "2006", howpublished = "{Seminar at the Institute of Graphics and Parallel Processing, Johannes Kepler University, Linz, Austria}", url = "http://www.csm.ornl.gov/~engelman/publications/scott06advancing.ppt.pdf", abstract = "Today's high performance computing systems have several reliability deficiencies resulting in noticeable availability and serviceability issues. For example, head and service nodes represent a single point of failure and control for an entire system as they render it inaccessible and unmanageable in case of a failure until repair, causing a significant downtime. Furthermore, current solutions for fault-tolerance focus on dealing with the result of a failure. However, most are unable to transparently mask runtime system configuration changes caused by failures and require a complete restart of essential system services, such as MPI, in case of a failure. High availability computing strives to avoid the problems of unexpected failures through preemptive measures. The overall goal of our research is to expand today's effort in high availability for high-performance computing, so that systems can be kept alive by an OS runtime environment that understands the concepts of dynamic system configuration and degraded operation mode. This talk will present an overview of recent research performed at Oak Ridge National Laboratory in collaboration with Louisiana Tech University, North Carolina State University and the University of Reading in developing core technologies and proof-of-concept prototypes that improve the overall reliability, availability and serviceability of high-performance computing systems."
} @misc{engelmann05high4, author = "Christian Engelmann", title = "High Availability for Ultra-Scale High-End Scientific Computing", month = oct # "~18, ", year = "2005", howpublished = "{Seminar at the Department of Computer Science, University of Reading, Reading, United Kingdom}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high4.ppt.pdf", abstract = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in high availability solutions for ultra-scale scientific high-end computing." 
} @misc{engelmann05high3, author = "Christian Engelmann", title = "High Availability for Ultra-Scale High-End Scientific Computing", month = sep # "~26, ", year = "2005", howpublished = "{Seminar at the Department of Mathematics and Computer Science, Fayetteville State University, Fayetteville, NC, USA}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high3.ppt.pdf", abstract = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. This talk will present an overview of recent research at Oak Ridge National Laboratory in fault tolerance and high availability solutions for ultra-scale scientific high-end computing."
} @misc{engelmann05high2, author = "Christian Engelmann", title = "High Availability for Ultra-Scale High-End Scientific Computing", month = may # "~13, ", year = "2005", howpublished = "{Seminar at the Department of Computer Science, University of Reading, Reading, United Kingdom}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high2.ppt.pdf", abstract = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors, such as the IBM Blue Gene/L and the Cray X1, is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. The overall goal of our research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration.
This talk will present an overview of recent research at Oak Ridge National Laboratory in fault-tolerant heterogeneous metacomputing, advanced super-scalable algorithms and high availability system software for ultra-scale scientific high-end computing." } @misc{engelmann05high1, author = "Christian Engelmann", title = "High Availability for Ultra-Scale High-End Scientific Computing", month = apr # "~15, ", year = "2005", howpublished = "{Seminar at the Center for Entrepreneurship and Information Technology, Louisiana Tech University, Ruston, LA, USA}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann05high1.ppt.pdf", abstract = "A major concern in exploiting ultra-scale architectures for scientific high-end computing (HEC) with tens to hundreds of thousands of processors is the potential inability to identify problems and take preemptive action before a failure impacts a running job. In fact, in systems of this scale, predictions estimate the mean time to interrupt in terms of hours. Current solutions for fault-tolerance in HEC focus on dealing with the result of a failure. However, most are unable to handle runtime system configuration changes caused by failures and require a complete restart of essential system services (e.g. MPI) or even of the entire machine. High availability (HA) computing strives to avoid the problems of unexpected failures through preemptive measures. There are various techniques to implement high availability. In contrast to active/hot-standby high availability with its fail-over model, active/active high availability with its virtual synchrony model is superior in many areas including scalability, throughput, availability and responsiveness. However, it is significantly more complex. 
The overall goal of this research is to expand today's effort in HA for HEC, so that systems that have the ability to hot-swap hardware components can be kept alive by an OS runtime environment that understands the concept of dynamic system configuration. With the aim of addressing the future challenges of high availability in ultra-scale HEC, this project intends to develop a proof-of-concept implementation of an active/active high availability system software framework." } @misc{engelmann04diskless, author = "Christian Engelmann", title = "Diskless Checkpointing on Super-scale Architectures -- {A}pplied to the Fast Fourier Transform", month = feb # "~25, ", year = "2004", howpublished = "{Invited talk at the $11^{th}$ SIAM Conference on Parallel Processing for Scientific Computing ({SIAM PP}) 2004, San Francisco, CA, USA}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann04diskless.ppt.pdf", abstract = "This talk discusses the issue of fault-tolerance in distributed computer systems with tens or hundreds of thousands of diskless processor units. Such systems, like the IBM Blue Gene/L, are predicted to be deployed in the next five to ten years. Since a 100,000-processor system is going to be less reliable, scientific applications need to be able to recover from occurring failures more efficiently. In this paper, we adapt the present technique of diskless checkpointing to such huge distributed systems in order to equip existing scientific algorithms with super-scalable fault-tolerance. First, we discuss the method of diskless checkpointing, then we adapt this technique to super-scale architectures and finally we present results from an implementation of the Fast Fourier Transform that uses the adapted technique to achieve super-scale fault-tolerance."
} @misc{engelmann04superscalable, author = "Christian Engelmann", title = "Super-scalable Algorithms -- {N}ext Generation Supercomputing on 100,000 and more Processors", month = jan # "~29, ", year = "2004", howpublished = "{Seminar at the Computer Science and Mathematics Division, Oak Ridge National Laboratory, Oak Ridge, TN, USA}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann04superscalable.ppt.pdf", abstract = "This talk discusses recent research into the issues and potential problems of algorithm scalability and fault-tolerance on next-generation high-performance computer systems with tens and even hundreds of thousands of processors. Such massively parallel computers, like the IBM Blue Gene/L, are going to be deployed in the next five to ten years and existing deficiencies in scalability and fault-tolerance need to be addressed soon. Scientific algorithms have shown poor scalability on 10,000-processor systems that exist today. Furthermore, future systems will be less reliable due to the large number of components. Super-scalable algorithms, which have the properties of scale invariance and natural fault-tolerance, are able to get the correct answer despite multiple task failures and without checkpointing. We will show that such algorithms exist for a wide variety of problems, such as finite difference, finite element, multigrid and global maximum. Despite these findings, traditional algorithms may still be preferred due to their known behavior, or simply because a super-scalable algorithm does not exist or is hard to find for a particular problem. In this case, we propose a peer-to-peer diskless checkpointing algorithm that can provide scale invariant fault-tolerance." 
} @misc{engelmann03distributed, author = "Christian Engelmann", title = "Distributed Peer-to-Peer Control for {Harness}", month = feb # "~11, ", year = "2004", howpublished = "{Seminar at the Department of Computer Science, North Carolina State University, Raleigh, NC, USA}", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann03distributed.ppt.pdf", abstract = "Harness is an adaptable fault-tolerant virtual machine environment for next-generation heterogeneous distributed computing developed as a follow on to PVM. It additionally enables the assembly of applications from plug-ins and provides fault-tolerance. This work describes the distributed control, which manages global state replication to ensure a high-availability of service. Group communication services achieve an agreement on an initial global state and a linear history of global state changes at all members of the distributed virtual machine. This global state is replicated to all members to easily recover from single, multiple and cascaded faults. A peer-to-peer ring network architecture and tunable multi-point failure conditions provide heterogeneity and scalability. Finally, the integration of the distributed control into the multi-threaded kernel architecture of Harness offers a fault-tolerant global state database service for plug-ins and applications." } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Theses %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @phdthesis{engelmann08symmetric03, author = "Christian Engelmann", title = "Symmetric Active/Active High Availability for High-Performance Computing System Services", year = "2008", school = "Department of Computer Science, University of Reading, UK", note = "Thesis research performed at Oak Ridge National Laboratory. Thesis submitted to the University of Reading, awaiting defense. Advisor: Prof. Vassil N. 
Alexandrov (University of Reading).", abstract = "In order to address anticipated high failure rates, reliability, availability and serviceability have become an urgent priority for next-generation high-performance computing (HPC) systems. This thesis aims to pave the way for highly available HPC systems by focusing on their most critical components and by reinforcing them with appropriate high availability solutions. Service components, such as head and service nodes, are the ``Achilles heel'' of a HPC system. A failure typically results in a complete system-wide outage. This thesis targets efficient software state replication mechanisms for service component redundancy to achieve high availability as well as high performance. Its methodology relies on defining a modern theoretical foundation for providing service-level high availability, identifying availability deficiencies of HPC systems, and comparing various service-level high availability methods. This thesis showcases several developed proof-of-concept prototypes providing high availability for services running on HPC head and service nodes using the symmetric active/active replication method, {\em i.e.}, state-machine replication, to complement prior work in this area using active/standby and asymmetric active/active configurations. Presented contributions include a generic taxonomy for service high availability, an insight into availability deficiencies of HPC systems, and a unified definition of service-level high availability methods. 
Further contributions encompass a fully functional symmetric active/active high availability prototype for a HPC job and resource management service that does not require modification of service, a fully functional symmetric active/active high availability prototype for a HPC parallel file system metadata service that offers high performance, and two preliminary prototypes for a transparent symmetric active/active replication software framework for client-service and dependent service scenarios that hide the replication infrastructure from clients and services. Assuming a mean-time to failure of 5,000 hours for a head or service node, all presented prototypes improve service availability from 99.285\% to 99.994\% in a two-node system, and to 99.99996\% with three nodes." } @mastersthesis{engelmann01distributed2, author = "Christian Engelmann", title = "Distributed Peer-to-Peer Control for {Harness}", month = jul # "~7, ", year = "2001", school = "Department of Computer Science, University of Reading, UK", note = "Double diploma in conjunction with the Department of Engineering~I, Technical College for Engineering and Economics (FHTW) Berlin, Germany. Advisors: Prof. Vassil N. Alexandrov (University of Reading); George A. (Al) Geist (Oak Ridge National Laboratory).", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed2.pdf", abstract = "Parallel processing, the method of cutting down a large computational problem into many small tasks which are solved in parallel, is a field of increasing importance in science. Cost-effective, flexible and efficient simulations of mathematical models of physical, chemical or biological real-world problems are replacing the traditional experimental research. 
Current software solutions for parallel and scientific computation, like Parallel Virtual Machine and Message Passing Interface, have limitations in handling faults and failures, in utilizing heterogeneous and dynamically changing communication structures, and in enabling migrating or cooperative applications. The current research in heterogeneous adaptable reconfigurable networked systems (Harness) aims to produce the next generation of software solutions for distributed computing. A high-available and light-weighted distributed virtual machine service provides an encapsulation of a few hundred to a few thousand physical machines in a virtual heterogeneous large scale cluster. A high availability of a service in distributed systems can be achieved by replication of the service state on multiple server processes. If one or more server processes fail, the surviving ones continue to provide the service because they know the state. Since every member of a distributed virtual machine is part of the distributed virtual machine service state and is able to change this state, a distributed control is needed to replicate the state and maintain its consistency. This distributed control manages state changes as well as the state-replication and the detection of and recovery from faults and failures of server processes. This work analyzes system architectures currently used in heterogeneous distributed computing by defining terms, conditions and assumptions. It shows that such systems are asynchronous and may use partially synchronous communication to detect and to distinguish different classes of faults and failures. It describes how a high availability of a large scale distributed service on a huge number of servers residing on different geographical locations can be realized.
Asynchronous group communication services, such as Reliable Broadcast, Atomic Broadcast, Distributed Agreement and Membership, are analyzed to develop linear scalable algorithms in an unidirectional and in a bidirectional connected asynchronous peer-to-peer ring architecture. A Transaction Control group communication service is introduced as state-replication service. The system analysis distinguishes different types of distributed systems, where active transactions execute state changes using non-replicated data of one or more servers and inactive transactions report state changes using replicated data only. It is applicable for passive fault-tolerant distributed databases as well as for active fault-tolerant distributed control mechanisms. No control token is used and time stamps are avoided, so that all members of a server group have equal responsibilities and are independent from the system time. A prototype which implements the most complicated Transaction Control algorithm is realized due to the complexity of the distributed system and the early development stage of the introduced algorithms. The prototype is used to obtain practical experience with the state-replication algorithm." } @mastersthesis{engelmann01distributed, author = "Christian Engelmann", title = "Distributed Peer-to-Peer Control for {Harness}", month = feb # "~23, ", year = "2001", school = "Department of Engineering~I, Technical College for Engineering and Economics (FHTW) Berlin, Germany", note = "Double diploma in conjunction with the Department of Computer Science, University of Reading, UK. Advisors: Prof. Uwe Metzler (Technical College for Engineering and Economics (FHTW) Berlin); George A. 
(Al) Geist (Oak Ridge National Laboratory).", url = "http://www.csm.ornl.gov/~engelman/publications/engelmann01distributed.pdf", abstract = "Parallel processing, the method of cutting down a large computational problem into many small tasks which are solved in parallel, is a field of increasing importance in science. Cost-effective, flexible and efficient simulations of mathematical models of physical, chemical or biological real-world problems are replacing the traditional experimental research. Current software solutions for parallel and scientific computation, like Parallel Virtual Machine and Message Passing Interface, have limitations in handling faults and failures, in utilizing heterogeneous and dynamically changing communication structures, and in enabling migrating or cooperative applications. The current research in heterogeneous adaptable reconfigurable networked systems (Harness) aims to produce the next generation of software solutions for distributed computing. A high-available and light-weighted distributed virtual machine service provides an encapsulation of a few hundred to a few thousand physical machines in a virtual heterogeneous large scale cluster. A high availability of a service in distributed systems can be achieved by replication of the service state on multiple server processes. If one ore more server processes fails, the surviving ones continue to provide the service because they know the state. Since every member of a distributed virtual machine is part of the distributed virtual machine service state and is able to change this state, a distributed control is needed to replicate the state and maintain its consistency. This distributed control manages state changes as well as the state-replication and the detection of and recovery from faults and failures of server processes. This work analyzes system architectures currently used in heterogeneous distributed computing by defining terms, conditions and assumptions. 
It shows that such systems are asynchronous and may use partially synchronous communication to detect and to distinguish different classes of faults and failures. It describes how a high availability of a large scale distributed service on a huge number of servers residing on different geographical locations can be realized. Asynchronous group communication services, such as Reliable Broadcast, Atomic Broadcast, Distributed Agreement and Membership, are analyzed to develop linear scalable algorithms in an unidirectional and in a bidirectional connected asynchronous peer-to-peer ring architecture. A Transaction Control group communication service is introduced as state-replication service. The system analysis distinguishes different types of distributed systems, where active transactions execute state changes using non-replicated data of one or more servers and inactive transactions report state changes using replicated data only. It is applicable for passive fault-tolerant distributed databases as well as for active fault-tolerant distributed control mechanisms. No control token is used and time stamps are avoided, so that all members of a server group have equal responsibilities and are independent from the system time. A prototype which implements the most complicated Transaction Control algorithm is realized due to the complexity of the distributed system and the early development stage of the introduced algorithms. The prototype is used to obtain practical experience with the state-replication algorithm." } %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% % Co-advised Theses %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% @mastersthesis{koenning07virtualized, author = "Bj{\"o}rn K{\"o}nning", title = "Virtualized Environments for the {Harness Workbench}", month = mar # "~14, ", year = "2007", school = "Department of Computer Science, University of Reading, UK", note = "Advisors: Prof. Vassil N. 
Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory).", url = "http://www.csm.ornl.gov/~engelman/students/koenning07virtualized.pdf", abstract = "The expanded use of computational sciences today leads to a significant need of high performance computing systems. High performance computing is currently undergoing vigorous revival, and multiple efforts are underway to develop much faster computing systems in the near future. New software tools are required for the efficient use of petascale computing systems. With the new Harness Workbench Project the Oak Ridge National Laboratory intends to develop an appropriate development and runtime environment for high performance computing platforms. This dissertation project is part of the Harness Workbench Project, and deals with the development of a concept for virtualised environments and various approaches to create and describe them. The developed virtualisation approach is based on the \verb|chroot| mechanism and uses platform-independent environment descriptions. File structures and environment variables are emulated to provide the portability of computational software over diverse high performance computing platforms. Security measures and sandbox characteristic are integrable." } @mastersthesis{weber07high, author = "Matthias Weber", title = "High Availability for the {Lustre} File System", month = mar # "~14, ", year = "2007", school = "Department of Computer Science, University of Reading, UK", note = "Double diploma in conjunction with the Department of Engineering~I, Technical College for Engineering and Economics (FHTW) Berlin, Germany. Advisors: Prof. Vassil N. 
Alexandrov (University of Reading); Christian Engelmann (Oak Ridge National Laboratory).", url = "http://www.csm.ornl.gov/~engelman/students/weber07high.pdf", abstract = "With the growing importance of high performance computing and, more importantly, the fast growing size of sophisticated high performance computing systems, research in the area of high availability is essential to meet the needs to sustain the current growth. This Master thesis project aims to improve the availability of Lustre. Major concern of this project is the metadata server of the file system. The metadata server of Lustre suffers from the last single point of failure in the file system. To overcome this single point of failure an active/active high availability approach is introduced. The new file system design with multiple MDS nodes running in virtual synchrony leads to a significant increase of availability. Two prototype implementations aim to show how the proposed system design and its new realized form of symmetric active/active high availability can be accomplished in practice. The results of this work point out the difficulties in adapting the file system to the active/active high availability design. Tests identify not achieved functionality and show performance problems of the proposed solution. The findings of this dissertation may be used for further work on high availability for distributed file systems." } @mastersthesis{baumann06design, author = "Ronald Baumann", title = "Design and Development of Prototype Components for the {Harness} High-Performance Computing Workbench", month = mar # "~6, ", year = "2006", school = "Department of Computer Science, University of Reading, UK", note = "Double diploma in conjunction with the Department of Engineering~I, Technical College for Engineering and Economics (FHTW) Berlin, Germany. Advisors: Prof. Vassil N. Alexandrov (University of Reading); George A. 
(Al) Geist and Christian Engelmann (Oak Ridge National Laboratory).", url = "http://www.csm.ornl.gov/~engelman/students/baumann06design.pdf", abstract = "This master thesis examines plug-in technology, especially the new field of parallel plug-ins. Plug-ins are popular because they extend the capabilities of software packages such as browsers and Photoshop, and allow an individual user to add new functionality. Parallel plug-ins also provide the above capabilities to a distributed set of resources, i.e., a plug-in now becomes a set of coordinating plug-ins. Second, the set of plugins may be heterogeneous either in function or because the underlying resources are heterogeneous. This new dimension of complexity provides a rich research space which is explored in this thesis. Experiences are collected and presented as parallel plug-in paradigms and concepts. The Harness framework was used in this project, in particular the plugin manager and available communication capabilities. Plug-ins provide methods for users to extend Harness according to their requirements. The result of this thesis is a parallel plug-in paradigm and template for Harness. Users of the Harness environment will be able to design and implement their applications in the form of parallel plug-ins easier and faster by using the paradigm resulting from this project. Prototypes were implemented which handle different aspects of parallel plug-ins. Parallel plug-in configurations were tested on an appropriate number of Harness kernels, including available communication and error-handling capabilities. Furthermore, research was done in the area of fault tolerance while parallel plug-ins are (un)loaded, as well as while a task is performed." 
} @mastersthesis{uhlemann06high, author = "Kai Uhlemann", title = "High Availability for High-End Scientific Computing", month = mar # "~6, ", year = 2006, school = "Department of Computer Science, University of Reading, UK", note = "Double diploma in conjunction with the Department of Engineering~I, Technical College for Engineering and Economics (FHTW) Berlin, Germany. Advisors: Prof. Vassil N. Alexandrov (University of Reading); George A. (Al) Geist and Christian Engelmann (Oak Ridge National Laboratory).", url = "http://www.csm.ornl.gov/~engelman/students/uhlemann06high.pdf", abstract = "With the growing interest and popularity in high performance cluster computing and, more importantly, the fast growing size of compute clusters, research in the area of high availability is essential to meet the needs to sustain the current growth. This Master thesis project introduces a new approach for high availability focusing on the head node of a cluster system. This projects focus is on providing high availability to the job scheduler service, which is the most vital part of the traditional Beowulf-style cluster architecture. This research seeks to add high availability to the job scheduler service and resource management system, typically running on the head node, leading to a significant increase of availability for cluster computing. Also, this software project takes advantage of the virtual synchrony paradigm to achieve active/active replication, the highest form of high availability. A proof-of-concept implementation shows how high availability can be designed in software and what results can be expected of such a system. The results may be reused for future or existing projects to further improve and extent the high availability of compute clusters." }