Parse and assemble kernel functions (#567)

* Parse and assemble kernel functions

Written in "EVM++" assembly. Later on we will add some privileged opcodes (in unused opcode ordinals), making it an extension of EVM bytecode.

I don't think there's much of a standard for EVM assembly, but I loosely based the syntax on this [proposal](https://gist.github.com/axic/17ddbbce4738ccf4040d30cbb5de484e).

* PR feedback

* tweaks for consistency

* terminology tweaks

* Update evm/src/cpu/kernel/opcodes.rs

Co-authored-by: Jacqueline Nabaglo <jakub@mirprotocol.org>

* Update evm/src/cpu/kernel/opcodes.rs

Co-authored-by: Jacqueline Nabaglo <jakub@mirprotocol.org>

* Update evm/src/cpu/kernel/opcodes.rs

Co-authored-by: Jacqueline Nabaglo <jakub@mirprotocol.org>

Co-authored-by: Jacqueline Nabaglo <jakub@mirprotocol.org>
This commit is contained in:
Daniel Lubarov 2022-06-20 20:32:29 -07:00 committed by GitHub
parent 2797000377
commit 2e818172f0
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 562 additions and 9 deletions

View File

@ -9,8 +9,11 @@ plonky2 = { path = "../plonky2" }
plonky2_util = { path = "../util" }
anyhow = "1.0.40"
env_logger = "0.9.0"
ethereum-types = "0.13.1"
itertools = "0.10.0"
log = "0.4.14"
pest = "2.1.3"
pest_derive = "2.1.0"
rayon = "1.5.1"
rand = "0.8.5"
rand_chacha = "0.3.1"

View File

@ -1,15 +1,21 @@
use std::ops::Range;
// Filter. 1 if the row corresponds to a cycle of execution and 0 otherwise.
// Lets us re-use decode columns in non-cycle rows.
pub const IS_CPU_CYCLE: usize = 0;
/// Filter. 1 if the row is part of bootstrapping the kernel code, 0 otherwise.
pub const IS_BOOTSTRAP_KERNEL: usize = 0;
// If CPU cycle: The opcode being decoded, in {0, ..., 255}.
/// Filter. 1 if the row is part of bootstrapping a contract's code, 0 otherwise.
pub const IS_BOOTSTRAP_CONTRACT: usize = IS_BOOTSTRAP_KERNEL + 1;
/// Filter. 1 if the row corresponds to a cycle of execution and 0 otherwise.
/// Lets us re-use decode columns in non-cycle rows.
pub const IS_CPU_CYCLE: usize = IS_BOOTSTRAP_CONTRACT + 1;
/// If CPU cycle: The opcode being decoded, in {0, ..., 255}.
pub const OPCODE: usize = IS_CPU_CYCLE + 1;
// If CPU cycle: flags for EVM instructions. PUSHn, DUPn, and SWAPn only get one flag each. Invalid
// opcodes are split between a number of flags for practical reasons. Exactly one of these flags
// must be 1.
/// If CPU cycle: flags for EVM instructions. PUSHn, DUPn, and SWAPn only get one flag each. Invalid
/// opcodes are split between a number of flags for practical reasons. Exactly one of these flags
/// must be 1.
pub const IS_STOP: usize = OPCODE + 1;
pub const IS_ADD: usize = IS_STOP + 1;
pub const IS_MUL: usize = IS_ADD + 1;
@ -121,8 +127,8 @@ pub const IS_INVALID_20: usize = IS_INVALID_19 + 1;
pub const START_INSTRUCTION_FLAGS: usize = IS_STOP;
pub const END_INSTRUCTION_FLAGS: usize = IS_INVALID_20 + 1;
// If CPU cycle: the opcode, broken up into bits.
// **big-endian** order
/// If CPU cycle: the opcode, broken up into bits.
/// **Big-endian** order.
pub const OPCODE_BITS: [usize; 8] = [
END_INSTRUCTION_FLAGS,
END_INSTRUCTION_FLAGS + 1,

View File

@ -0,0 +1,28 @@
//! Loads each kernel assembly file and concatenates them.
use itertools::Itertools;
use super::assembler::{assemble, Kernel};
use crate::cpu::kernel::parser::parse;
#[allow(dead_code)] // TODO: Should be used once witness generation is done.
/// Parses every embedded kernel assembly source and assembles them, in listed order, into a
/// single `Kernel`.
pub(crate) fn combined_kernel() -> Kernel {
    // The sources are embedded into the binary at compile time.
    let sources = [
        include_str!("asm/storage_read.asm"),
        include_str!("asm/storage_write.asm"),
    ];
    let parsed = sources.iter().map(|src| parse(src)).collect_vec();
    assemble(parsed)
}
#[cfg(test)]
mod tests {
    use crate::cpu::kernel::aggregator::combined_kernel;

    /// Smoke test: parsing and assembling the entire kernel must not panic.
    #[test]
    fn make_kernel() {
        let _kernel = combined_kernel();
    }
}

View File

@ -0,0 +1,10 @@
// TODO: Dummy code for now.
// `global` records this label's code offset in the kernel's global label table.
global storage_read:
JUMPDEST
// Push a placeholder literal and immediately discard it.
PUSH 1234
POP
// An infinite loop:
// `mylabel` is local to this file; `PUSH mylabel` assembles to a push of its code offset.
mylabel:
JUMPDEST
PUSH mylabel
JUMP

View File

@ -0,0 +1,6 @@
// TODO: Dummy code for now.
// `global` records this label's code offset in the kernel's global label table.
global storage_write:
JUMPDEST
PUSH 123 // Whatever.
POP
// Raw data: each literal (hex or decimal) is emitted verbatim as a single byte.
BYTES 0x1, 0x02, 3

View File

@ -0,0 +1,207 @@
use std::collections::HashMap;
use super::ast::PushTarget;
use crate::cpu::kernel::{
ast::{File, Item},
opcodes::{get_opcode, get_push_opcode},
};
/// The number of bytes to push when pushing an offset within the code (i.e. when assembling jumps).
/// Ideally we would automatically use the minimal number of bytes required, but that would be
/// nontrivial given the circular dependency between an offset and its size.
const BYTES_PER_OFFSET: u8 = 3;
/// The output of assembling one or more kernel files.
#[derive(PartialEq, Eq, Debug)]
pub struct Kernel {
/// The assembled bytecode of all files, concatenated in the order they were assembled.
code: Vec<u8>,
/// Maps each global label to its byte offset within `code`.
global_labels: HashMap<String, usize>,
}
pub(crate) fn assemble(files: Vec<File>) -> Kernel {
let mut code = vec![];
let mut global_labels = HashMap::new();
for file in files {
assemble_file(file.body, &mut code, &mut global_labels);
}
Kernel {
code,
global_labels,
}
}
/// Assembles a single file, appending its bytecode to `code` and recording its global labels
/// in `global_labels`.
///
/// Assembly is done in two phases: the first computes the offset of every label declared in
/// the file, and the second emits the bytes, substituting label offsets into `PUSH`es.
///
/// A pushed label is resolved against this file's local labels first, then against the global
/// labels known so far (those of previously assembled files and of this file).
///
/// # Panics
///
/// Panics if a label is declared twice, if a pushed label cannot be resolved, if a label offset
/// does not fit in `BYTES_PER_OFFSET` bytes, or if an opcode or literal is invalid.
fn assemble_file(body: Vec<Item>, code: &mut Vec<u8>, global_labels: &mut HashMap<String, usize>) {
    // First discover the offset of each label in this file.
    let mut local_labels = HashMap::<String, usize>::new();
    let mut offset = code.len();
    for item in &body {
        match item {
            Item::GlobalLabelDeclaration(label) => {
                let old = global_labels.insert(label.clone(), offset);
                assert!(old.is_none(), "Duplicate global label: {}", label);
            }
            Item::LocalLabelDeclaration(label) => {
                let old = local_labels.insert(label.clone(), offset);
                assert!(old.is_none(), "Duplicate local label: {}", label);
            }
            Item::Push(target) => offset += 1 + push_target_size(target) as usize,
            Item::StandardOp(_) => offset += 1,
            Item::Bytes(bytes) => offset += bytes.len(),
        }
    }

    // Now that we have label offsets, we can assemble the file.
    for item in body {
        match item {
            Item::GlobalLabelDeclaration(_) | Item::LocalLabelDeclaration(_) => {
                // Nothing to do; we processed labels in the prior phase.
            }
            Item::Push(target) => {
                let target_bytes: Vec<u8> = match target {
                    PushTarget::Literal(literal) => literal.to_trimmed_be_bytes(),
                    PushTarget::Label(label) => {
                        // Local labels shadow global ones with the same name.
                        let label_offset =
                            match local_labels.get(&label).or(global_labels.get(&label)) {
                                Some(&found) => found,
                                None => panic!("No such label: {}", label),
                            };
                        // Refuse to silently truncate an offset that needs more bytes than
                        // we reserved for it in the first phase.
                        let offset_bits = 8 * u32::from(BYTES_PER_OFFSET);
                        assert!(
                            label_offset.checked_shr(offset_bits).unwrap_or(0) == 0,
                            "Offset {} of label {} does not fit in {} bytes",
                            label_offset,
                            label,
                            BYTES_PER_OFFSET
                        );
                        // We want the BYTES_PER_OFFSET least significant bytes in BE order,
                        // i.e. the tail of the full big-endian encoding.
                        let be_bytes = label_offset.to_be_bytes();
                        be_bytes[be_bytes.len() - usize::from(BYTES_PER_OFFSET)..].to_vec()
                    }
                };
                code.push(get_push_opcode(target_bytes.len() as u8));
                code.extend(target_bytes);
            }
            Item::StandardOp(opcode) => {
                code.push(get_opcode(&opcode));
            }
            Item::Bytes(bytes) => code.extend(bytes.iter().map(|b| b.to_u8())),
        }
    }

    assert_eq!(
        code.len(),
        offset,
        "The two phases gave different code lengths"
    );
}
/// The number of operand bytes a `PushTarget` occupies in the assembled code, not counting the
/// `PUSH` opcode itself.
fn push_target_size(target: &PushTarget) -> u8 {
    match target {
        // Labels always use a fixed-width offset.
        PushTarget::Label(_) => BYTES_PER_OFFSET,
        // Literals use as many bytes as their trimmed big-endian encoding.
        PushTarget::Literal(literal) => {
            let encoded = literal.to_trimmed_be_bytes();
            encoded.len() as u8
        }
    }
}
#[cfg(test)]
mod tests {
use std::collections::HashMap;
use crate::cpu::kernel::{assembler::*, ast::*};
/// Assembles two files and checks the concatenated code, the label offsets pushed for jumps,
/// and the recorded global labels.
#[test]
fn two_files() {
// We will test two simple files, with a label and a jump, to ensure that jump offsets
// are correctly shifted based on the offset of the containing file.
let file_1 = File {
body: vec![
Item::GlobalLabelDeclaration("function_1".to_string()),
Item::StandardOp("JUMPDEST".to_string()),
Item::StandardOp("ADD".to_string()),
Item::StandardOp("MUL".to_string()),
],
};
let file_2 = File {
body: vec![
Item::GlobalLabelDeclaration("function_2".to_string()),
Item::StandardOp("JUMPDEST".to_string()),
Item::StandardOp("DIV".to_string()),
Item::LocalLabelDeclaration("mylabel".to_string()),
Item::StandardOp("JUMPDEST".to_string()),
Item::StandardOp("MOD".to_string()),
Item::Push(PushTarget::Label("mylabel".to_string())),
Item::StandardOp("JUMP".to_string()),
],
};
// Label declarations emit no bytes, so file 1 occupies offsets 0..3 and file 2 starts at 3.
let expected_code = vec![
get_opcode("JUMPDEST"),
get_opcode("ADD"),
get_opcode("MUL"),
get_opcode("JUMPDEST"),
get_opcode("DIV"),
get_opcode("JUMPDEST"),
get_opcode("MOD"),
get_push_opcode(BYTES_PER_OFFSET),
// The label offset, 5, in 3-byte BE form.
0,
0,
5,
get_opcode("JUMP"),
];
let mut expected_global_labels = HashMap::new();
expected_global_labels.insert("function_1".to_string(), 0);
expected_global_labels.insert("function_2".to_string(), 3);
let expected_kernel = Kernel {
code: expected_code,
global_labels: expected_global_labels,
};
let program = vec![file_1, file_2];
assert_eq!(assemble(program), expected_kernel);
}
/// Declaring the same global label in two different files must panic.
#[test]
#[should_panic]
fn global_label_collision() {
let file_1 = File {
body: vec![
Item::GlobalLabelDeclaration("foo".to_string()),
Item::StandardOp("JUMPDEST".to_string()),
],
};
let file_2 = File {
body: vec![
Item::GlobalLabelDeclaration("foo".to_string()),
Item::StandardOp("JUMPDEST".to_string()),
],
};
assemble(vec![file_1, file_2]);
}
/// Declaring the same local label twice within one file must panic.
#[test]
#[should_panic]
fn local_label_collision() {
let file = File {
body: vec![
Item::LocalLabelDeclaration("foo".to_string()),
Item::StandardOp("JUMPDEST".to_string()),
Item::LocalLabelDeclaration("foo".to_string()),
Item::StandardOp("ADD".to_string()),
],
};
assemble(vec![file]);
}
/// `Bytes` items are emitted verbatim, one byte per literal, for both hex and decimal literals.
#[test]
fn literal_bytes() {
let file = File {
body: vec![
Item::Bytes(vec![
Literal::Hex("12".to_string()),
Literal::Decimal("42".to_string()),
]),
Item::Bytes(vec![
Literal::Hex("fe".to_string()),
Literal::Decimal("255".to_string()),
]),
],
};
let code = assemble(vec![file]).code;
assert_eq!(code, vec![0x12, 42, 0xfe, 255])
}
}

84
evm/src/cpu/kernel/ast.rs Normal file
View File

@ -0,0 +1,84 @@
use ethereum_types::U256;
use plonky2_util::ceil_div_usize;
/// A parsed assembly file.
#[derive(Debug)]
pub(crate) struct File {
/// The items of the file, in source order.
pub(crate) body: Vec<Item>,
}
/// A single top-level element of an assembly file.
#[derive(Debug)]
pub(crate) enum Item {
/// Declares a global label.
GlobalLabelDeclaration(String),
/// Declares a label that is local to the current file.
LocalLabelDeclaration(String),
/// A `PUSH` operation.
Push(PushTarget),
/// Any opcode besides a PUSH opcode.
StandardOp(String),
/// Raw data bytes to embed verbatim; each literal must fit in a `u8`.
Bytes(Vec<Literal>),
}
/// The target of a `PUSH` operation.
#[derive(Debug)]
pub(crate) enum PushTarget {
/// A numeric literal, pushed using as few bytes as its value requires.
Literal(Literal),
/// The name of a label, whose code offset is pushed.
Label(String),
}
/// A numeric literal, kept in its textual form until it is converted to a number.
#[derive(Debug)]
pub(crate) enum Literal {
/// A string of decimal digits.
Decimal(String),
/// A string of hex digits, without the `0x` prefix.
Hex(String),
}
impl Literal {
    /// Returns the digit string of this literal together with the radix it is written in.
    fn digits_and_radix(&self) -> (&str, u32) {
        match self {
            Literal::Decimal(digits) => (digits.as_str(), 10),
            Literal::Hex(digits) => (digits.as_str(), 16),
        }
    }

    /// Returns the big-endian encoding of this literal with leading zero bytes removed.
    ///
    /// The result always contains at least one byte, so zero is encoded as `[0]`. This matters
    /// because the EVM has no zero-length `PUSH`: `PUSH 0` must assemble to `PUSH1 0x00`.
    ///
    /// # Panics
    ///
    /// Panics if the literal is not a valid 256-bit unsigned integer.
    pub(crate) fn to_trimmed_be_bytes(&self) -> Vec<u8> {
        let u256 = self.to_u256();
        // `bits()` is 0 for a zero value, hence the lower bound of one byte.
        let num_bytes = ceil_div_usize(u256.bits(), 8).max(1);
        // `byte` is little-endian, so we manually reverse it.
        (0..num_bytes).rev().map(|i| u256.byte(i)).collect()
    }

    /// Parses this literal as a 256-bit unsigned integer.
    ///
    /// # Panics
    ///
    /// Panics if the digits are invalid for the radix or the value does not fit in 256 bits.
    pub(crate) fn to_u256(&self) -> U256 {
        let (src, radix) = self.digits_and_radix();
        U256::from_str_radix(src, radix)
            .unwrap_or_else(|_| panic!("Not a valid u256 literal: {:?}", self))
    }

    /// Parses this literal as a single byte.
    ///
    /// # Panics
    ///
    /// Panics if the digits are invalid for the radix or the value exceeds 255.
    pub(crate) fn to_u8(&self) -> u8 {
        let (src, radix) = self.digits_and_radix();
        u8::from_str_radix(src, radix)
            .unwrap_or_else(|_| panic!("Not a valid u8 literal: {:?}", self))
    }
}
#[cfg(test)]
mod tests {
use crate::cpu::kernel::ast::*;
/// Checks the trimmed big-endian encoding of decimal and hex literals, including a hex literal
/// with an odd number of digits.
#[test]
fn literal_to_be_bytes() {
// 768 = 0x0300.
assert_eq!(
Literal::Decimal("768".into()).to_trimmed_be_bytes(),
vec![0x03, 0x00]
);
assert_eq!(
Literal::Hex("a1b2".into()).to_trimmed_be_bytes(),
vec![0xa1, 0xb2]
);
// An odd number of hex digits: the leading digit forms a byte of its own.
assert_eq!(
Literal::Hex("1b2".into()).to_trimmed_be_bytes(),
vec![0x1, 0xb2]
);
}
}

View File

@ -0,0 +1,23 @@
// Grammar for our EVM assembly code.
// Loosely based on https://gist.github.com/axic/17ddbbce4738ccf4040d30cbb5de484e

// Whitespace and comments are skipped implicitly between the tokens of non-atomic rules.
WHITESPACE = _{ " " | "\t" | NEWLINE }
// Block comments, or line comments running to the end of the line. A line comment does not
// consume the newline itself (WHITESPACE does), so it may also end at the end of the input.
COMMENT = _{ "/*" ~ (!"*/" ~ ANY)* ~ "*/" | "//" ~ (!NEWLINE ~ ANY)* }

identifier_first_char = _{ ASCII_ALPHA | "_" }
identifier_char = _{ ASCII_ALPHANUMERIC | "_" }
identifier = @{ identifier_first_char ~ identifier_char* }

literal_decimal = @{ ASCII_DIGIT+ }
literal_hex = @{ ^"0x" ~ ASCII_HEX_DIGIT+ }
literal = { literal_hex | literal_decimal }

// Alternatives are tried in order, so labels and keyword items take precedence over plain
// instructions.
item = { global_label | local_label | bytes_item | push_instruction | nullary_instruction }
global_label = { ^"GLOBAL " ~ identifier ~ ":" }
local_label = { identifier ~ ":" }
bytes_item = { ^"BYTES " ~ literal ~ ("," ~ literal)* }
push_instruction = { ^"PUSH " ~ (literal | identifier) }
nullary_instruction = { identifier }

file = { SOI ~ item* ~ silent_eoi }
// Like EOI, but silent, so it produces no token pair in the parse result.
silent_eoi = _{ !ANY }

View File

@ -0,0 +1,5 @@
pub mod aggregator;
mod assembler;
mod ast;
mod opcodes;
mod parser;

View File

@ -0,0 +1,123 @@
/// The opcode of the `PUSH[n]` instruction, given a byte count `n`.
///
/// `PUSH1` is `0x60` and `PUSH32` is `0x7f`.
///
/// # Panics
///
/// Panics if `n` is not in the range `1..=32`.
pub(crate) fn get_push_opcode(n: u8) -> u8 {
    assert!(
        (1..=32).contains(&n),
        "PUSH byte count must be between 1 and 32, got {}",
        n
    );
    0x60 + (n - 1)
}
/// The opcode of a standard instruction (not a `PUSH`).
///
/// The mnemonic is matched case-insensitively.
///
/// # Panics
///
/// Panics if the mnemonic is not recognized.
pub(crate) fn get_opcode(mnemonic: &str) -> u8 {
    match mnemonic.to_uppercase().as_str() {
        // Stop and arithmetic.
        "STOP" => 0x00,
        "ADD" => 0x01,
        "MUL" => 0x02,
        "SUB" => 0x03,
        "DIV" => 0x04,
        "SDIV" => 0x05,
        "MOD" => 0x06,
        "SMOD" => 0x07,
        "ADDMOD" => 0x08,
        "MULMOD" => 0x09,
        "EXP" => 0x0a,
        "SIGNEXTEND" => 0x0b,
        // Comparison and bitwise logic.
        "LT" => 0x10,
        "GT" => 0x11,
        "SLT" => 0x12,
        "SGT" => 0x13,
        "EQ" => 0x14,
        "ISZERO" => 0x15,
        "AND" => 0x16,
        "OR" => 0x17,
        "XOR" => 0x18,
        "NOT" => 0x19,
        "BYTE" => 0x1a,
        "SHL" => 0x1b,
        "SHR" => 0x1c,
        "SAR" => 0x1d,
        // Hashing.
        "KECCAK256" => 0x20,
        // Environment information.
        "ADDRESS" => 0x30,
        "BALANCE" => 0x31,
        "ORIGIN" => 0x32,
        "CALLER" => 0x33,
        "CALLVALUE" => 0x34,
        "CALLDATALOAD" => 0x35,
        "CALLDATASIZE" => 0x36,
        "CALLDATACOPY" => 0x37,
        "CODESIZE" => 0x38,
        "CODECOPY" => 0x39,
        "GASPRICE" => 0x3a,
        "EXTCODESIZE" => 0x3b,
        "EXTCODECOPY" => 0x3c,
        "RETURNDATASIZE" => 0x3d,
        "RETURNDATACOPY" => 0x3e,
        "EXTCODEHASH" => 0x3f,
        // Block information.
        "BLOCKHASH" => 0x40,
        "COINBASE" => 0x41,
        "TIMESTAMP" => 0x42,
        "NUMBER" => 0x43,
        "DIFFICULTY" => 0x44,
        "GASLIMIT" => 0x45,
        "CHAINID" => 0x46,
        "SELFBALANCE" => 0x47,
        "BASEFEE" => 0x48,
        // Stack, memory, storage and flow.
        "POP" => 0x50,
        "MLOAD" => 0x51,
        "MSTORE" => 0x52,
        "MSTORE8" => 0x53,
        "SLOAD" => 0x54,
        "SSTORE" => 0x55,
        "JUMP" => 0x56,
        "JUMPI" => 0x57,
        "GETPC" => 0x58,
        "MSIZE" => 0x59,
        "GAS" => 0x5a,
        "JUMPDEST" => 0x5b,
        // Duplication.
        "DUP1" => 0x80,
        "DUP2" => 0x81,
        "DUP3" => 0x82,
        "DUP4" => 0x83,
        "DUP5" => 0x84,
        "DUP6" => 0x85,
        "DUP7" => 0x86,
        "DUP8" => 0x87,
        "DUP9" => 0x88,
        "DUP10" => 0x89,
        "DUP11" => 0x8a,
        "DUP12" => 0x8b,
        "DUP13" => 0x8c,
        "DUP14" => 0x8d,
        "DUP15" => 0x8e,
        "DUP16" => 0x8f,
        // Exchange.
        "SWAP1" => 0x90,
        "SWAP2" => 0x91,
        "SWAP3" => 0x92,
        "SWAP4" => 0x93,
        "SWAP5" => 0x94,
        "SWAP6" => 0x95,
        "SWAP7" => 0x96,
        "SWAP8" => 0x97,
        "SWAP9" => 0x98,
        "SWAP10" => 0x99,
        "SWAP11" => 0x9a,
        "SWAP12" => 0x9b,
        "SWAP13" => 0x9c,
        "SWAP14" => 0x9d,
        "SWAP15" => 0x9e,
        "SWAP16" => 0x9f,
        // Logging.
        "LOG0" => 0xa0,
        "LOG1" => 0xa1,
        "LOG2" => 0xa2,
        "LOG3" => 0xa3,
        "LOG4" => 0xa4,
        // System operations.
        "CREATE" => 0xf0,
        "CALL" => 0xf1,
        "CALLCODE" => 0xf2,
        "RETURN" => 0xf3,
        "DELEGATECALL" => 0xf4,
        "CREATE2" => 0xf5,
        "STATICCALL" => 0xfa,
        "REVERT" => 0xfd,
        "INVALID" => 0xfe,
        "SELFDESTRUCT" => 0xff,
        _ => panic!("Unrecognized mnemonic {}", mnemonic),
    }
}

View File

@ -0,0 +1,57 @@
use pest::iterators::Pair;
use pest::Parser;
use crate::cpu::kernel::ast::{File, Item, Literal, PushTarget};
/// Parses EVM assembly code.
///
/// The parser implementation and the `Rule` enum are derived from the grammar file named in
/// the `grammar` attribute below.
#[derive(pest_derive::Parser)]
#[grammar = "cpu/kernel/evm_asm.pest"]
pub struct AsmParser;
/// Parses the source text of one assembly file into its AST.
///
/// Panics if the text does not conform to the grammar.
pub(crate) fn parse(s: &str) -> File {
    let mut pairs = AsmParser::parse(Rule::file, s).expect("Parsing failed");
    // A successful parse of `Rule::file` yields the `file` pair first.
    let file_pair = pairs.next().unwrap();
    let body = file_pair.into_inner().map(parse_item).collect();
    File { body }
}
/// Converts an `item` pair into the corresponding AST `Item`.
fn parse_item(item: Pair<Rule>) -> Item {
    // An `item` wraps exactly one concrete alternative; unwrap it and dispatch on its rule.
    let inner = item.into_inner().next().unwrap();
    let rule = inner.as_rule();
    match rule {
        Rule::global_label => {
            let name = inner.into_inner().next().unwrap();
            Item::GlobalLabelDeclaration(name.as_str().into())
        }
        Rule::local_label => {
            let name = inner.into_inner().next().unwrap();
            Item::LocalLabelDeclaration(name.as_str().into())
        }
        Rule::bytes_item => {
            let literals = inner.into_inner().map(parse_literal).collect();
            Item::Bytes(literals)
        }
        Rule::push_instruction => {
            let target = inner.into_inner().next().unwrap();
            Item::Push(parse_push_target(target))
        }
        Rule::nullary_instruction => Item::StandardOp(inner.as_str().into()),
        _ => panic!("Unexpected {:?}", rule),
    }
}
/// Converts the operand of a `PUSH` into a `PushTarget`: either a literal or a label name.
fn parse_push_target(target: Pair<Rule>) -> PushTarget {
    match target.as_rule() {
        Rule::literal => PushTarget::Literal(parse_literal(target)),
        Rule::identifier => {
            let label = target.as_str();
            PushTarget::Label(label.into())
        }
        unexpected => panic!("Unexpected {:?}", unexpected),
    }
}
/// Converts a `literal` pair into an AST `Literal`, keeping the digits in textual form.
fn parse_literal(literal: Pair<Rule>) -> Literal {
    // A `literal` wraps either a hex or a decimal literal.
    let inner = literal.into_inner().next().unwrap();
    match inner.as_rule() {
        Rule::literal_hex => Literal::Hex(parse_hex(inner)),
        Rule::literal_decimal => {
            let digits = inner.as_str();
            Literal::Decimal(digits.into())
        }
        unexpected => panic!("Unexpected {:?}", unexpected),
    }
}
/// Returns the digits of a hex literal, i.e. its text with the two-character prefix removed.
fn parse_hex(hex: Pair<Rule>) -> String {
    let text = hex.as_str();
    let prefix = &text[..2];
    debug_assert!(prefix == "0x" || prefix == "0X");
    let digits = &text[2..];
    digits.to_string()
}

View File

@ -1,4 +1,5 @@
pub(crate) mod columns;
pub mod cpu_stark;
pub(crate) mod decode;
pub mod kernel;
mod simple_logic;