diff --git a/Makefile.in b/Makefile.in
index bb1f14b..bce57b7 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -181,7 +181,7 @@ LIBOBJS0 = alter.lo analyze.lo attach.lo auth.lo \
notify.lo opcodes.lo os.lo os_unix.lo os_win.lo \
pager.lo parse.lo pcache.lo pcache1.lo pragma.lo prepare.lo printf.lo \
random.lo resolve.lo rowset.lo rtree.lo \
- sqlite3session.lo select.lo sqlite3rbu.lo status.lo \
+ sqlite3session.lo select.lo sqlite3rbu.lo status.lo stmt.lo \
table.lo threads.lo tokenize.lo treeview.lo trigger.lo \
update.lo util.lo vacuum.lo \
vdbe.lo vdbeapi.lo vdbeaux.lo vdbeblob.lo vdbemem.lo vdbesort.lo \
@@ -350,7 +350,8 @@ SRC += \
$(TOP)/ext/rbu/sqlite3rbu.h \
$(TOP)/ext/rbu/sqlite3rbu.c
SRC += \
- $(TOP)/ext/misc/json1.c
+ $(TOP)/ext/misc/json1.c \
+ $(TOP)/ext/misc/stmt.c
# Generated source code files
#
@@ -430,9 +431,11 @@ TESTSRC += \
$(TOP)/ext/misc/nextchar.c \
$(TOP)/ext/misc/percentile.c \
$(TOP)/ext/misc/regexp.c \
+ $(TOP)/ext/misc/remember.c \
$(TOP)/ext/misc/series.c \
$(TOP)/ext/misc/spellfix.c \
$(TOP)/ext/misc/totype.c \
+ $(TOP)/ext/misc/unionvtab.c \
$(TOP)/ext/misc/wholenumber.c
# Source code to the library files needed by the test fixture
@@ -482,7 +485,8 @@ TESTSRC2 = \
$(TOP)/ext/fts3/fts3_tokenizer.c \
$(TOP)/ext/fts3/fts3_write.c \
$(TOP)/ext/async/sqlite3async.c \
- $(TOP)/ext/session/sqlite3session.c
+ $(TOP)/ext/session/sqlite3session.c \
+ $(TOP)/ext/misc/stmt.c
# Header files used by all library source files.
#
@@ -550,7 +554,8 @@ FUZZDATA = \
$(TOP)/test/fuzzdata1.db \
$(TOP)/test/fuzzdata2.db \
$(TOP)/test/fuzzdata3.db \
- $(TOP)/test/fuzzdata4.db
+ $(TOP)/test/fuzzdata4.db \
+ $(TOP)/test/fuzzdata5.db
# Standard options to testfixture
#
@@ -562,8 +567,12 @@ SHELL_OPT = -DSQLITE_ENABLE_JSON1 -DSQLITE_ENABLE_FTS4
# SHELL_OPT += -DSQLITE_ENABLE_FTS5
SHELL_OPT += -DSQLITE_ENABLE_EXPLAIN_COMMENTS
SHELL_OPT += -DSQLITE_ENABLE_UNKNOWN_SQL_FUNCTION
+SHELL_OPT += -DSQLITE_ENABLE_STMTVTAB
FUZZERSHELL_OPT = -DSQLITE_ENABLE_JSON1
-FUZZCHECK_OPT = -DSQLITE_ENABLE_JSON1 -DSQLITE_ENABLE_MEMSYS5
+FUZZCHECK_OPT = -DSQLITE_ENABLE_JSON1 -DSQLITE_ENABLE_MEMSYS5 -DSQLITE_OSS_FUZZ
+FUZZCHECK_OPT += -DSQLITE_MAX_MEMORY=50000000
+FUZZCHECK_SRC = $(TOP)/test/fuzzcheck.c $(TOP)/test/ossfuzz.c
+DBFUZZ_OPT =
# This is the default Makefile target. The objects listed here
# are what get build when you type just "make" with no arguments.
@@ -612,8 +621,15 @@ fuzzershell$(TEXE): $(TOP)/tool/fuzzershell.c sqlite3.c sqlite3.h
$(LTLINK) -o $@ $(FUZZERSHELL_OPT) \
$(TOP)/tool/fuzzershell.c sqlite3.c $(TLIBS)
-fuzzcheck$(TEXE): $(TOP)/test/fuzzcheck.c sqlite3.c sqlite3.h
- $(LTLINK) -o $@ $(FUZZCHECK_OPT) $(TOP)/test/fuzzcheck.c sqlite3.c $(TLIBS)
+fuzzcheck$(TEXE): $(FUZZCHECK_SRC) sqlite3.c sqlite3.h
+ $(LTLINK) -o $@ $(FUZZCHECK_OPT) $(FUZZCHECK_SRC) sqlite3.c $(TLIBS)
+
+ossshell$(TEXE): $(TOP)/test/ossfuzz.c $(TOP)/test/ossshell.c sqlite3.c sqlite3.h
+ $(LTLINK) -o $@ $(FUZZCHECK_OPT) $(TOP)/test/ossshell.c \
+ $(TOP)/test/ossfuzz.c sqlite3.c $(TLIBS)
+
+dbfuzz$(TEXE): $(TOP)/test/dbfuzz.c sqlite3.c sqlite3.h
+ $(LTLINK) -o $@ $(DBFUZZ_OPT) $(TOP)/test/dbfuzz.c sqlite3.c $(TLIBS)
mptester$(TEXE): sqlite3.lo $(TOP)/mptest/mptest.c
$(LTLINK) -o $@ -I. $(TOP)/mptest/mptest.c sqlite3.lo \
@@ -1022,6 +1038,9 @@ sqlite3session.lo: $(TOP)/ext/session/sqlite3session.c $(HDR) $(EXTHDR)
json1.lo: $(TOP)/ext/misc/json1.c
$(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/misc/json1.c
+stmt.lo: $(TOP)/ext/misc/stmt.c
+ $(LTCOMPILE) -DSQLITE_CORE -c $(TOP)/ext/misc/stmt.c
+
# FTS5 things
#
FTS5_SRC = \
@@ -1071,6 +1090,7 @@ TESTFIXTURE_FLAGS += -DSQLITE_SERVER=1 -DSQLITE_PRIVATE="" -DSQLITE_CORE
TESTFIXTURE_FLAGS += -DBUILD_sqlite
TESTFIXTURE_FLAGS += -DSQLITE_SERIES_CONSTRAINT_VERIFY=1
TESTFIXTURE_FLAGS += -DSQLITE_DEFAULT_PAGE_SIZE=1024
+TESTFIXTURE_FLAGS += -DSQLITE_ENABLE_STMTVTAB
TESTFIXTURE_SRC0 = $(TESTSRC2) libsqlite3.la
TESTFIXTURE_SRC1 = sqlite3.c
@@ -1103,6 +1123,11 @@ fastfuzztest: fuzzcheck$(TEXE) $(FUZZDATA)
-valgrindfuzz: fuzzcheck$(TEXT) $(FUZZDATA)
+valgrindfuzz: fuzzcheck$(TEXE) $(FUZZDATA)
valgrind ./fuzzcheck$(TEXE) --cell-size-check --limit-mem 10M --timeout 600 $(FUZZDATA)
+# Run the veryquick.test TCL tests using the testfixture program.
+#
+tcltest: ./testfixture$(TEXE)
+ ./testfixture$(TEXE) $(TOP)/test/veryquick.test $(TESTOPTS)
+
# Minimal testing that runs in less than 3 minutes
#
quicktest: ./testfixture$(TEXE)
@@ -1111,8 +1136,7 @@ quicktest: ./testfixture$(TEXE)
# This is the common case. Run many tests that do not take too long,
# including fuzzcheck, sqlite3_analyzer, and sqldiff tests.
#
-test: $(TESTPROGS) sourcetest fastfuzztest
- ./testfixture$(TEXE) $(TOP)/test/veryquick.test $(TESTOPTS)
+test: fastfuzztest sourcetest $(TESTPROGS) tcltest
# Run a test using valgrind. This can take a really long time
# because valgrind is so much slower than a native machine.
@@ -1139,6 +1163,10 @@ sqlite3_analyzer.c: sqlite3.c $(TOP)/src/tclsqlite.c $(TOP)/tool/spaceanal.tcl
sqlite3_analyzer$(TEXE): sqlite3_analyzer.c
$(LTLINK) sqlite3_analyzer.c -o $@ $(LIBTCL) $(TLIBS)
+dbdump$(TEXE): $(TOP)/ext/misc/dbdump.c sqlite3.lo
+ $(LTLINK) -DDBDUMP_STANDALONE -o $@ \
+ $(TOP)/ext/misc/dbdump.c sqlite3.lo $(TLIBS)
+
showdb$(TEXE): $(TOP)/tool/showdb.c sqlite3.lo
$(LTLINK) -o $@ $(TOP)/tool/showdb.c sqlite3.lo $(TLIBS)
@@ -1163,8 +1191,13 @@ LogEst$(TEXE): $(TOP)/tool/logest.c sqlite3.h
wordcount$(TEXE): $(TOP)/test/wordcount.c sqlite3.lo
$(LTLINK) -o $@ $(TOP)/test/wordcount.c sqlite3.lo $(TLIBS)
-speedtest1$(TEXE): $(TOP)/test/speedtest1.c sqlite3.lo
- $(LTLINK) -o $@ $(TOP)/test/speedtest1.c sqlite3.lo $(TLIBS)
+speedtest1$(TEXE): $(TOP)/test/speedtest1.c sqlite3.c
+ $(LTLINK) $(ST_OPT) -o $@ $(TOP)/test/speedtest1.c sqlite3.c $(TLIBS)
+
+KV_OPT += -DSQLITE_DIRECT_OVERFLOW_READ
+
+kvtest$(TEXE): $(TOP)/test/kvtest.c sqlite3.c
+ $(LTLINK) $(KV_OPT) -o $@ $(TOP)/test/kvtest.c sqlite3.c $(TLIBS)
rbu$(EXE): $(TOP)/ext/rbu/rbu.c $(TOP)/ext/rbu/sqlite3rbu.c sqlite3.lo
$(LTLINK) -I. -o $@ $(TOP)/ext/rbu/rbu.c sqlite3.lo $(TLIBS)
diff --git a/Makefile.msc b/Makefile.msc
index 0d42d41..da94288 100644
--- a/Makefile.msc
+++ b/Makefile.msc
@@ -21,7 +21,14 @@ USE_AMALGAMATION = 1
# Set this non-0 to enable full warnings (-W4, etc) when compiling.
#
!IFNDEF USE_FULLWARN
-USE_FULLWARN = 0
+USE_FULLWARN = 1
+!ENDIF
+
+# Set this non-0 to enable treating warnings as errors (-WX, etc) when
+# compiling.
+#
+!IFNDEF USE_FATAL_WARN
+USE_FATAL_WARN = 0
!ENDIF
# Set this non-0 to enable full runtime error checks (-RTC1, etc). This
@@ -493,6 +500,12 @@ TCC = $(CC) -nologo -W4 -DINCLUDE_MSVC_H=1 $(CCOPTS) $(TCCOPTS)
TCC = $(CC) -nologo -W3 $(CCOPTS) $(TCCOPTS)
!ENDIF
+# Check if warnings should be treated as errors when compiling.
+#
+!IF $(USE_FATAL_WARN)!=0
+TCC = $(TCC) -WX
+!ENDIF
+
TCC = $(TCC) -DSQLITE_OS_WIN=1 -I. -I$(TOP) -I$(TOP)\src -fp:precise
RCC = $(RC) -DSQLITE_OS_WIN=1 -I. -I$(TOP) -I$(TOP)\src $(RCOPTS) $(RCCOPTS)
@@ -733,6 +746,10 @@ RCC = $(RCC) -DSQLITE_ENABLE_API_ARMOR=1
!IF $(DEBUG)>2
TCC = $(TCC) -DSQLITE_DEBUG=1
RCC = $(RCC) -DSQLITE_DEBUG=1
+!IF $(DYNAMIC_SHELL)==0
+TCC = $(TCC) -DSQLITE_ENABLE_WHERETRACE -DSQLITE_ENABLE_SELECTTRACE
+RCC = $(RCC) -DSQLITE_ENABLE_WHERETRACE -DSQLITE_ENABLE_SELECTTRACE
+!ENDIF
!ENDIF
!IF $(DEBUG)>4 || $(OSTRACE)!=0
@@ -1277,7 +1294,8 @@ SRC07 = \
$(TOP)\ext\rtree\rtree.c \
$(TOP)\ext\session\sqlite3session.c \
$(TOP)\ext\rbu\sqlite3rbu.c \
- $(TOP)\ext\misc\json1.c
+ $(TOP)\ext\misc\json1.c \
+ $(TOP)\ext\misc\stmt.c
# Extension header files, part 1.
#
@@ -1396,9 +1414,11 @@ TESTEXT = \
$(TOP)\ext\misc\nextchar.c \
$(TOP)\ext\misc\percentile.c \
$(TOP)\ext\misc\regexp.c \
+ $(TOP)\ext\misc\remember.c \
$(TOP)\ext\misc\series.c \
$(TOP)\ext\misc\spellfix.c \
$(TOP)\ext\misc\totype.c \
+ $(TOP)\ext\misc\unionvtab.c \
$(TOP)\ext\misc\wholenumber.c
# Source code to the library files needed by the test fixture
@@ -1479,14 +1499,15 @@ FUZZDATA = \
$(TOP)\test\fuzzdata1.db \
$(TOP)\test\fuzzdata2.db \
$(TOP)\test\fuzzdata3.db \
- $(TOP)\test\fuzzdata4.db
+ $(TOP)\test\fuzzdata4.db \
+ $(TOP)\test\fuzzdata5.db
# <>
# Additional compiler options for the shell. These are only effective
# when the shell is not being dynamically linked.
#
!IF $(DYNAMIC_SHELL)==0 && $(FOR_WIN10)==0
-SHELL_COMPILE_OPTS = $(SHELL_COMPILE_OPTS) -DSQLITE_SHELL_JSON1 -DSQLITE_ENABLE_FTS4 -DSQLITE_ENABLE_EXPLAIN_COMMENTS
+SHELL_COMPILE_OPTS = $(SHELL_COMPILE_OPTS) -DSQLITE_SHELL_JSON1 -DSQLITE_ENABLE_FTS4 -DSQLITE_ENABLE_EXPLAIN_COMMENTS -DSQLITE_ENABLE_STMTVTAB
!ENDIF
# <>
@@ -1494,7 +1515,13 @@ SHELL_COMPILE_OPTS = $(SHELL_COMPILE_OPTS) -DSQLITE_SHELL_JSON1 -DSQLITE_ENABLE_
#
MPTESTER_COMPILE_OPTS = -DSQLITE_SHELL_JSON1 -DSQLITE_ENABLE_FTS5
FUZZERSHELL_COMPILE_OPTS = -DSQLITE_ENABLE_JSON1
-FUZZCHECK_COMPILE_OPTS = -DSQLITE_ENABLE_JSON1 -DSQLITE_ENABLE_MEMSYS5
+FUZZCHECK_COMPILE_OPTS = -DSQLITE_ENABLE_JSON1 -DSQLITE_ENABLE_MEMSYS5 -DSQLITE_OSS_FUZZ -DSQLITE_MAX_MEMORY=50000000
+FUZZCHECK_SRC = $(TOP)\test\fuzzcheck.c $(TOP)\test\ossfuzz.c
+OSSSHELL_SRC = $(TOP)\test\ossshell.c $(TOP)\test\ossfuzz.c
+DBFUZZ_COMPILE_OPTS = -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION
+KV_COMPILE_OPTS = -DSQLITE_THREADSAFE=0 -DSQLITE_DIRECT_OVERFLOW_READ
+DBSELFTEST_COMPILE_OPTS = -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -DSQLITE_ENABLE_RTREE -DSQLITE_ENABLE_FTS4 -DSQLITE_ENABLE_FTS5
+ST_COMPILE_OPTS = -DSQLITE_THREADSAFE=0
# Standard options to testfixture.
#
@@ -1537,7 +1564,7 @@ $(SQLITE3DLL): $(LIBOBJ) $(LIBRESOBJS) $(CORE_LINK_DEP)
sqlite3.def: libsqlite3.lib
echo EXPORTS > sqlite3.def
dumpbin /all libsqlite3.lib \
- | $(TCLSH_CMD) $(TOP)\tool\replace.tcl include "^\s+1 _?(sqlite3_[^@]*)(?:@\d+)?$$" \1 \
+ | $(TCLSH_CMD) $(TOP)\tool\replace.tcl include "^\s+1 _?(sqlite3(?:session|changeset|changegroup)?_[^@]*)(?:@\d+)?$$" \1 \
| sort >> sqlite3.def
# <>
@@ -1564,8 +1591,14 @@ sourcetest: srcck1.exe sqlite3.c
fuzzershell.exe: $(TOP)\tool\fuzzershell.c $(SQLITE3C) $(SQLITE3H)
$(LTLINK) $(NO_WARN) $(FUZZERSHELL_COMPILE_OPTS) $(TOP)\tool\fuzzershell.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
-fuzzcheck.exe: $(TOP)\test\fuzzcheck.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) $(FUZZCHECK_COMPILE_OPTS) $(TOP)\test\fuzzcheck.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+dbfuzz.exe: $(TOP)\test\dbfuzz.c $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) $(DBFUZZ_COMPILE_OPTS) $(TOP)\test\dbfuzz.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+
+fuzzcheck.exe: $(FUZZCHECK_SRC) $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) $(FUZZCHECK_COMPILE_OPTS) $(FUZZCHECK_SRC) $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+
+ossshell.exe: $(OSSSHELL_SRC) $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) $(FUZZCHECK_COMPILE_OPTS) $(OSSSHELL_SRC) $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
mptester.exe: $(TOP)\mptest\mptest.c $(SQLITE3C) $(SQLITE3H)
$(LTLINK) $(NO_WARN) $(MPTESTER_COMPILE_OPTS) $(TOP)\mptest\mptest.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
@@ -2058,6 +2091,7 @@ TESTFIXTURE_FLAGS = $(TESTFIXTURE_FLAGS) -DSQLITE_SERVER=1 -DSQLITE_PRIVATE=""
TESTFIXTURE_FLAGS = $(TESTFIXTURE_FLAGS) -DSQLITE_CORE $(NO_WARN)
TESTFIXTURE_FLAGS = $(TESTFIXTURE_FLAGS) -DSQLITE_SERIES_CONSTRAINT_VERIFY=1
TESTFIXTURE_FLAGS = $(TESTFIXTURE_FLAGS) -DSQLITE_DEFAULT_PAGE_SIZE=1024
+TESTFIXTURE_FLAGS = $(TESTFIXTURE_FLAGS) -DSQLITE_ENABLE_STMTVTAB
TESTFIXTURE_FLAGS = $(TESTFIXTURE_FLAGS) $(TEST_CCONV_OPTS)
TESTFIXTURE_SRC0 = $(TESTEXT) $(TESTSRC2)
@@ -2150,6 +2184,10 @@ sqlite3_analyzer.exe: sqlite3_analyzer.c $(LIBRESOBJS)
$(LTLINK) $(NO_WARN) -DBUILD_sqlite -I$(TCLINCDIR) sqlite3_analyzer.c \
/link $(LDFLAGS) $(LTLINKOPTS) $(LTLIBPATHS) $(LIBRESOBJS) $(LTLIBS) $(TLIBS)
+dbdump.exe: $(TOP)\ext\misc\dbdump.c $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) -DDBDUMP_STANDALONE $(TOP)\ext\misc\dbdump.c $(SQLITE3C) \
+ /link $(LDFLAGS) $(LTLINKOPTS) $(LTLIBPATHS) $(LIBRESOBJS) $(LTLIBS)
+
testloadext.lo: $(TOP)\src\test_loadext.c
$(LTCOMPILE) $(NO_WARN) -c $(TOP)\src\test_loadext.c
@@ -2157,48 +2195,59 @@ testloadext.dll: testloadext.lo
$(LD) $(LDFLAGS) $(LTLINKOPTS) $(LTLIBPATHS) /DLL /OUT:$@ testloadext.lo
showdb.exe: $(TOP)\tool\showdb.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\tool\showdb.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
showstat4.exe: $(TOP)\tool\showstat4.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\tool\showstat4.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
showjournal.exe: $(TOP)\tool\showjournal.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\tool\showjournal.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
showwal.exe: $(TOP)\tool\showwal.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\tool\showwal.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
-changeset.exe: $(TOP)\ext\session\changeset.c $(SQLITE3C)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+changeset.exe: $(TOP)\ext\session\changeset.c $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
+ -DSQLITE_ENABLE_SESSION=1 -DSQLITE_ENABLE_PREUPDATE_HOOK=1 \
$(TOP)\ext\session\changeset.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
fts3view.exe: $(TOP)\ext\fts3\tool\fts3view.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\ext\fts3\tool\fts3view.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
rollback-test.exe: $(TOP)\tool\rollback-test.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\tool\rollback-test.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
LogEst.exe: $(TOP)\tool\logest.c $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -Fe$@ $(TOP)\tool\LogEst.c /link $(LDFLAGS) $(LTLINKOPTS)
+ $(LTLINK) $(NO_WARN) $(TOP)\tool\LogEst.c /link $(LDFLAGS) $(LTLINKOPTS)
wordcount.exe: $(TOP)\test\wordcount.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_THREADSAFE=0 -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\test\wordcount.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
speedtest1.exe: $(TOP)\test\speedtest1.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_OMIT_LOAD_EXTENSION -Fe$@ \
+ $(LTLINK) $(NO_WARN) $(ST_COMPILE_OPTS) -DSQLITE_OMIT_LOAD_EXTENSION \
$(TOP)\test\speedtest1.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+kvtest.exe: $(TOP)\test\kvtest.c $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) $(KV_COMPILE_OPTS) \
+ $(TOP)\test\kvtest.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+
+dbselftest.exe: $(TOP)\test\dbselftest.c $(SQLITE3C) $(SQLITE3H)
+ $(LTLINK) $(NO_WARN) $(DBSELFTEST_COMPILE_OPTS) $(TOP)\test\dbselftest.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+
rbu.exe: $(TOP)\ext\rbu\rbu.c $(TOP)\ext\rbu\sqlite3rbu.c $(SQLITE3C) $(SQLITE3H)
- $(LTLINK) $(NO_WARN) -DSQLITE_ENABLE_RBU -Fe$@ \
+ $(LTLINK) $(NO_WARN) -DSQLITE_ENABLE_RBU \
$(TOP)\ext\rbu\rbu.c $(SQLITE3C) /link $(LDFLAGS) $(LTLINKOPTS)
+LSMDIR=$(TOP)\ext\lsm1
+!INCLUDE $(LSMDIR)\Makefile.msc
+
moreclean: clean
del /Q $(SQLITE3C) $(SQLITE3H) 2>NUL
# <>
@@ -2218,9 +2267,10 @@ clean:
-rmdir /Q/S tsrc 2>NUL
del /Q .target_source 2>NUL
del /Q tclsqlite3.exe $(SQLITETCLH) $(SQLITETCLDECLSH) 2>NUL
+ del /Q lsm.dll lsmtest.exe 2>NUL
del /Q testloadext.dll 2>NUL
del /Q testfixture.exe test.db 2>NUL
- del /Q LogEst.exe fts3view.exe rollback-test.exe showdb.exe 2>NUL
+ del /Q LogEst.exe fts3view.exe rollback-test.exe showdb.exe dbdump.exe 2>NUL
del /Q changeset.exe 2>NUL
del /Q showjournal.exe showstat4.exe showwal.exe speedtest1.exe 2>NUL
del /Q mptester.exe wordcount.exe rbu.exe srcck1.exe 2>NUL
diff --git a/README.md b/README.md
index dbc0205..a7b8701 100644
--- a/README.md
+++ b/README.md
@@ -8,6 +8,50 @@ If you are reading this on a Git mirror someplace, you are doing it wrong.
The [official repository](https://www.sqlite.org/src/) is better. Go there
now.
+## Obtaining The Code
+
+SQLite sources are managed using
+[Fossil](https://www.fossil-scm.org/), a distributed version control system
+that was specifically designed to support SQLite development.
+If you do not want to use Fossil, you can download tarballs or ZIP
+archives as follows:
+
+ * Latest trunk check-in:
+ <https://www.sqlite.org/src/tarball/sqlite.tar.gz> or
+ <https://www.sqlite.org/src/zip/sqlite.zip>.
+
+ * Latest release:
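+ <https://www.sqlite.org/src/tarball/sqlite.tar.gz?r=release> or
+ <https://www.sqlite.org/src/zip/sqlite.zip?r=release>.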
+
+ * For other check-ins, substitute an appropriate branch name or
+ tag or hash prefix for "release" in the URLs of the previous
+ bullet. Or browse the [timeline](https://www.sqlite.org/src/timeline)
+ to locate the check-in desired, click on its information page link,
+ then click on the "Tarball" or "ZIP Archive" links on the information
+ page.
+
+If you do want to use Fossil to check out the source tree,
+first install Fossil version 2.0 or later.
+(Source tarballs and precompiled binaries are available
+[here](https://www.fossil-scm.org/fossil/uv/download.html). Fossil is
+a stand-alone program. To install, simply download or build the single
+executable file and put that file someplace on your $PATH.)
+Then run commands like this:
+
+ mkdir ~/sqlite
+ cd ~/sqlite
+ fossil clone https://www.sqlite.org/src sqlite.fossil
+ fossil open sqlite.fossil
+
+After setting up a repository using the steps above, you can always
+update to the latest version using:
+
+ fossil update trunk ;# latest trunk check-in
+ fossil update release ;# latest official release
+
+Or type "fossil ui" to get a web-based user interface.
+
## Compiling
First create a directory in which to place
@@ -18,13 +62,13 @@ script found at the root of the source tree. Then run "make".
For example:
- tar xzf sqlite.tar.gz ;# Unpack the source tree into "sqlite"
- mkdir bld ;# Build will occur in a sibling directory
- cd bld ;# Change to the build directory
- ../sqlite/configure ;# Run the configure script
- make ;# Run the makefile.
- make sqlite3.c ;# Build the "amalgamation" source file
- make test ;# Run some tests (requires Tcl)
+ tar xzf sqlite.tar.gz ;# Unpack the source tree into "sqlite"
+ mkdir bld ;# Build will occur in a sibling directory
+ cd bld ;# Change to the build directory
+ ../sqlite/configure ;# Run the configure script
+ make ;# Run the makefile.
+ make sqlite3.c ;# Build the "amalgamation" source file
+ make test ;# Run some tests (requires Tcl)
See the makefile for additional targets.
@@ -43,13 +87,13 @@ with the provided "Makefile.msc" to build one of the supported targets.
For example:
- mkdir bld
- cd bld
- nmake /f Makefile.msc TOP=..\sqlite
- nmake /f Makefile.msc sqlite3.c TOP=..\sqlite
- nmake /f Makefile.msc sqlite3.dll TOP=..\sqlite
- nmake /f Makefile.msc sqlite3.exe TOP=..\sqlite
- nmake /f Makefile.msc test TOP=..\sqlite
+ mkdir bld
+ cd bld
+ nmake /f Makefile.msc TOP=..\sqlite
+ nmake /f Makefile.msc sqlite3.c TOP=..\sqlite
+ nmake /f Makefile.msc sqlite3.dll TOP=..\sqlite
+ nmake /f Makefile.msc sqlite3.exe TOP=..\sqlite
+ nmake /f Makefile.msc test TOP=..\sqlite
There are several build options that can be set via the NMAKE command
line. For example, to build for WinRT, simply add "FOR_WINRT=1" argument
@@ -64,19 +108,22 @@ The makefiles also require AWK.
## Source Code Tour
-Most of the core source files are in the **src/** subdirectory. But
-src/ also contains files used to build the "testfixture" test harness;
-those file all begin with "test". And src/ contains the "shell.c" file
-which is the main program for the "sqlite3.exe" command-line shell and
-the "tclsqlite.c" file which implements the bindings to SQLite from the
-Tcl programming language. (Historical note: SQLite began as a Tcl
+Most of the core source files are in the **src/** subdirectory. The
+**src/** folder also contains files used to build the "testfixture" test
+harness. The names of the source files used by "testfixture" all begin
+with "test".
+The **src/** folder also contains the "shell.c" file
+which is the main program for the "sqlite3.exe"
+[command-line shell](https://sqlite.org/cli.html) and
+the "tclsqlite.c" file which implements the
+[TCL bindings](https://sqlite.org/tclsqlite.html) for SQLite.
+(Historical note: SQLite began as a Tcl
extension and only later escaped to the wild as an independent library.)
Test scripts and programs are found in the **test/** subdirectory.
-There are other test suites for SQLite (see
-[How SQLite Is Tested](http://www.sqlite.org/testing.html))
-but those other test suites are
-in separate source repositories.
+Additional test code is found in other source repositories.
+See [How SQLite Is Tested](http://www.sqlite.org/testing.html) for
+additional information.
The **ext/** subdirectory contains code for extensions. The
Full-text search engine is in **ext/fts3**. The R-Tree engine is in
@@ -100,7 +147,7 @@ manually-edited files and automatically-generated files.
The SQLite interface is defined by the **sqlite3.h** header file, which is
generated from src/sqlite.h.in, ./manifest.uuid, and ./VERSION. The
[Tcl script](http://www.tcl.tk) at tool/mksqlite3h.tcl does the conversion.
-The manifest.uuid file contains the SHA1 hash of the particular check-in
+The manifest.uuid file contains the SHA3 hash of the particular check-in
and is used to generate the SQLITE\_SOURCE\_ID macro. The VERSION file
contains the current SQLite version number. The sqlite3.h header is really
just a copy of src/sqlite.h.in with the source-id and version number inserted
@@ -111,9 +158,8 @@ used to generate that documentation are in a separate source repository.
The SQL language parser is **parse.c** which is generate from a grammar in
the src/parse.y file. The conversion of "parse.y" into "parse.c" is done
by the [lemon](./doc/lemon.html) LALR(1) parser generator. The source code
-for lemon is at tool/lemon.c. Lemon uses a
-template for generating its parser. A generic template is in tool/lempar.c,
-but SQLite uses a slightly modified template found in src/lempar.c.
+for lemon is at tool/lemon.c. Lemon uses the tool/lempar.c file as a
+template for generating its parser.
Lemon also generates the **parse.h** header file, at the same time it
generates parse.c. But the parse.h header file is
@@ -133,6 +179,13 @@ that maps SQL language keywords (ex: "CREATE", "SELECT", "INDEX", etc.) into
the numeric codes used by the parse.c parser. The keywordhash.h file is
generated by a C-language program at tool mkkeywordhash.c.
+The **pragma.h** header file contains various definitions used to parse
+and implement the PRAGMA statements. The header is generated by a
+script **tool/mkpragmatab.tcl**. If you want to add a new PRAGMA, edit
+the **tool/mkpragmatab.tcl** file to insert the information needed by the
+parser for your new PRAGMA, then run the script to regenerate the
+**pragma.h** header file.
+
### The Amalgamation
All of the individual C source code and header files (both manually-edited
@@ -150,7 +203,7 @@ subdirectory (using the equivalent of "make target_source") then the
tool/mksqlite3c.tcl script is run to copy them all together in just the
right order while resolving internal "#include" references.
-The amalgamation source file is more than 100K lines long. Some symbolic
+The amalgamation source file is more than 200K lines long. Some symbolic
debuggers (most notably MSVC) are unable to deal with files longer than 64K
lines. To work around this, a separate Tcl script, tool/split-sqlite3c.tcl,
can be run on the amalgamation to break it up into a single small C file
@@ -167,14 +220,15 @@ See the [architectural description](http://www.sqlite.org/arch.html)
for details. Other documents that are useful in
(helping to understand how SQLite works include the
[file format](http://www.sqlite.org/fileformat2.html) description,
-the [virtual machine](http://www.sqlite.org/vdbe.html) that runs
+the [virtual machine](http://www.sqlite.org/opcode.html) that runs
prepared statements, the description of
[how transactions work](http://www.sqlite.org/atomiccommit.html), and
the [overview of the query planner](http://www.sqlite.org/optoverview.html).
-Unfortunately, years of effort have gone into optimizating SQLite, both
+Years of effort have gone into optimizing SQLite, both
for small size and high performance. And optimizations tend to result in
-complex code. So there is a lot of complexity in the SQLite implementation.
+complex code. So there is a lot of complexity in the current SQLite
+implementation. It will not be the easiest library in the world to hack.
Key files:
@@ -185,7 +239,7 @@ Key files:
* **sqliteInt.h** - this header file defines many of the data objects
used internally by SQLite.
- * **parse.y** - This file describes the LALR(1) grammer that SQLite uses
+ * **parse.y** - This file describes the LALR(1) grammar that SQLite uses
to parse SQL statements, and the actions that are taken at each step
in the parsing process.
@@ -218,13 +272,13 @@ Key files:
is not part of the core SQLite library. But as most of the tests in this
repository are written in Tcl, the Tcl language bindings are important.
-There are many other source files. Each has a suscinct header comment that
+There are many other source files. Each has a succinct header comment that
describes its purpose and role within the larger system.
## Contacts
The main SQLite webpage is [http://www.sqlite.org/](http://www.sqlite.org/)
-with geographically distributed backup servers at
+with geographically distributed backups at
[http://www2.sqlite.org/](http://www2.sqlite.org) and
[http://www3.sqlite.org/](http://www3.sqlite.org).
diff --git a/VERSION b/VERSION
index 861845e..076e9c7 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.15.2
+3.20.1
diff --git a/autoconf/Makefile.msc b/autoconf/Makefile.msc
index b53e237..f0f9a01 100644
--- a/autoconf/Makefile.msc
+++ b/autoconf/Makefile.msc
@@ -21,7 +21,14 @@ TOP = .
# Set this non-0 to enable full warnings (-W4, etc) when compiling.
#
!IFNDEF USE_FULLWARN
-USE_FULLWARN = 0
+USE_FULLWARN = 1
+!ENDIF
+
+# Set this non-0 to enable treating warnings as errors (-WX, etc) when
+# compiling.
+#
+!IFNDEF USE_FATAL_WARN
+USE_FATAL_WARN = 0
!ENDIF
# Set this non-0 to enable full runtime error checks (-RTC1, etc). This
@@ -454,6 +461,12 @@ TCC = $(CC) -nologo -W4 -DINCLUDE_MSVC_H=1 $(CCOPTS) $(TCCOPTS)
TCC = $(CC) -nologo -W3 $(CCOPTS) $(TCCOPTS)
!ENDIF
+# Check if warnings should be treated as errors when compiling.
+#
+!IF $(USE_FATAL_WARN)!=0
+TCC = $(TCC) -WX
+!ENDIF
+
TCC = $(TCC) -DSQLITE_OS_WIN=1 -I. -I$(TOP) -fp:precise
RCC = $(RC) -DSQLITE_OS_WIN=1 -I. -I$(TOP) $(RCOPTS) $(RCCOPTS)
@@ -632,6 +645,10 @@ RCC = $(RCC) -DSQLITE_ENABLE_API_ARMOR=1
!IF $(DEBUG)>2
TCC = $(TCC) -DSQLITE_DEBUG=1
RCC = $(RCC) -DSQLITE_DEBUG=1
+!IF $(DYNAMIC_SHELL)==0
+TCC = $(TCC) -DSQLITE_ENABLE_WHERETRACE -DSQLITE_ENABLE_SELECTTRACE
+RCC = $(RCC) -DSQLITE_ENABLE_WHERETRACE -DSQLITE_ENABLE_SELECTTRACE
+!ENDIF
!ENDIF
!IF $(DEBUG)>4 || $(OSTRACE)!=0
@@ -910,7 +927,7 @@ LIBRESOBJS =
# when the shell is not being dynamically linked.
#
!IF $(DYNAMIC_SHELL)==0 && $(FOR_WIN10)==0
-SHELL_COMPILE_OPTS = $(SHELL_COMPILE_OPTS) -DSQLITE_SHELL_JSON1 -DSQLITE_ENABLE_FTS4 -DSQLITE_ENABLE_EXPLAIN_COMMENTS
+SHELL_COMPILE_OPTS = $(SHELL_COMPILE_OPTS) -DSQLITE_SHELL_JSON1 -DSQLITE_ENABLE_FTS4 -DSQLITE_ENABLE_EXPLAIN_COMMENTS -DSQLITE_ENABLE_STMTVTAB
!ENDIF
@@ -937,7 +954,7 @@ Replace.exe:
sqlite3.def: Replace.exe $(LIBOBJ)
echo EXPORTS > sqlite3.def
dumpbin /all $(LIBOBJ) \
- | .\Replace.exe "^\s+/EXPORT:_?(sqlite3_[^@,]*)(?:@\d+|,DATA)?$$" $$1 true \
+ | .\Replace.exe "^\s+/EXPORT:_?(sqlite3(?:session|changeset|changegroup)?_[^@,]*)(?:@\d+|,DATA)?$$" $$1 true \
| sort >> sqlite3.def
$(SQLITE3EXE): $(TOP)\shell.c $(SHELL_CORE_DEP) $(LIBRESOBJS) $(SHELL_CORE_SRC) $(SQLITE3H)
diff --git a/autoconf/configure.ac b/autoconf/configure.ac
index b9a11aa..5a607de 100644
--- a/autoconf/configure.ac
+++ b/autoconf/configure.ac
@@ -55,9 +55,9 @@ AS_IF([ test x"$enable_editline" != xno ],[
LIBS=""
AC_SEARCH_LIBS([readline],[edit],[
AC_DEFINE([HAVE_EDITLINE],1,Define to use BSD editline)
- READLINE_LIBS=$LIBS
+ READLINE_LIBS="$LIBS -ltinfo"
enable_readline=no
- ])
+ ],[],[-ltinfo])
AS_UNSET(ac_cv_search_readline)
LIBS=$sLIBS
])
diff --git a/configure b/configure
index e22c4dc..c9b4073 100755
--- a/configure
+++ b/configure
@@ -1,6 +1,6 @@
#! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
-# Generated by GNU Autoconf 2.69 for sqlite 3.15.2.
+# Generated by GNU Autoconf 2.69 for sqlite 3.20.1.
#
#
# Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc.
@@ -726,8 +726,8 @@ MAKEFLAGS=
# Identity of this package.
PACKAGE_NAME='sqlite'
PACKAGE_TARNAME='sqlite'
-PACKAGE_VERSION='3.15.2'
-PACKAGE_STRING='sqlite 3.15.2'
+PACKAGE_VERSION='3.20.1'
+PACKAGE_STRING='sqlite 3.20.1'
PACKAGE_BUGREPORT=''
PACKAGE_URL=''
@@ -1463,7 +1463,7 @@ if test "$ac_init_help" = "long"; then
# Omit some internal or obsolete options to make the list less imposing.
# This message is too long to be a string in the A/UX 3.1 sh.
cat <<_ACEOF
-\`configure' configures sqlite 3.15.2 to adapt to many kinds of systems.
+\`configure' configures sqlite 3.20.1 to adapt to many kinds of systems.
Usage: $0 [OPTION]... [VAR=VALUE]...
@@ -1528,7 +1528,7 @@ fi
if test -n "$ac_init_help"; then
case $ac_init_help in
- short | recursive ) echo "Configuration of sqlite 3.15.2:";;
+ short | recursive ) echo "Configuration of sqlite 3.20.1:";;
esac
cat <<\_ACEOF
@@ -1652,7 +1652,7 @@ fi
test -n "$ac_init_help" && exit $ac_status
if $ac_init_version; then
cat <<\_ACEOF
-sqlite configure 3.15.2
+sqlite configure 3.20.1
generated by GNU Autoconf 2.69
Copyright (C) 2012 Free Software Foundation, Inc.
@@ -2071,7 +2071,7 @@ cat >config.log <<_ACEOF
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
-It was created by sqlite $as_me 3.15.2, which was
+It was created by sqlite $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
$ $0 $@
@@ -11252,7 +11252,7 @@ else
fi
if test "${use_debug}" = "yes" ; then
- TARGET_DEBUG="-DSQLITE_DEBUG=1"
+ TARGET_DEBUG="-DSQLITE_DEBUG=1 -DSQLITE_ENABLE_SELECTTRACE -DSQLITE_ENABLE_WHERETRACE -O0"
else
TARGET_DEBUG="-DNDEBUG"
fi
@@ -11356,7 +11356,7 @@ fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support MEMSYS5" >&5
$as_echo_n "checking whether to support MEMSYS5... " >&6; }
if test "${enable_memsys5}" = "yes"; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_MEMSYS5"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MEMSYS5"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
@@ -11373,7 +11373,7 @@ fi
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking whether to support MEMSYS3" >&5
$as_echo_n "checking whether to support MEMSYS3... " >&6; }
if test "${enable_memsys3}" = "yes" -a "${enable_memsys5}" = "no"; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_MEMSYS3"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MEMSYS3"
{ $as_echo "$as_me:${as_lineno-$LINENO}: result: yes" >&5
$as_echo "yes" >&6; }
else
@@ -11391,7 +11391,7 @@ else
fi
if test "${enable_fts3}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_FTS3"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS3"
fi
# Check whether --enable-fts4 was given.
if test "${enable_fts4+set}" = set; then :
@@ -11401,7 +11401,7 @@ else
fi
if test "${enable_fts4}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_FTS4"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS4"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing log" >&5
$as_echo_n "checking for library containing log... " >&6; }
if ${ac_cv_search_log+:} false; then :
@@ -11467,7 +11467,7 @@ else
fi
if test "${enable_fts5}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_FTS5"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS5"
{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for library containing log" >&5
$as_echo_n "checking for library containing log... " >&6; }
if ${ac_cv_search_log+:} false; then :
@@ -11536,7 +11536,7 @@ else
fi
if test "${enable_json1}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_JSON1"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_JSON1"
fi
#########
@@ -11549,7 +11549,7 @@ else
fi
if test "${enable_rtree}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_RTREE"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_RTREE"
fi
#########
@@ -11562,12 +11562,12 @@ else
fi
if test "${enable_session}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_SESSION"
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_PREUPDATE_HOOK"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_SESSION"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_PREUPDATE_HOOK"
fi
#########
-# attempt to duplicate any OMITS and ENABLES into the $(OPT_FEATURE_FLAGS) parameter
+# attempt to duplicate any OMITS and ENABLES into the ${OPT_FEATURE_FLAGS} parameter
for option in $CFLAGS $CPPFLAGS
do
case $option in
@@ -12151,7 +12151,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1
# report actual input values of CONFIG_FILES etc. instead of their
# values after options handling.
ac_log="
-This file was extended by sqlite $as_me 3.15.2, which was
+This file was extended by sqlite $as_me 3.20.1, which was
generated by GNU Autoconf 2.69. Invocation command line was
CONFIG_FILES = $CONFIG_FILES
@@ -12217,7 +12217,7 @@ _ACEOF
cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1
ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`"
ac_cs_version="\\
-sqlite config.status 3.15.2
+sqlite config.status 3.20.1
configured by $0, generated by GNU Autoconf 2.69,
with options \\"\$ac_cs_config\\"
diff --git a/configure.ac b/configure.ac
index c9e3af9..4deee8d 100644
--- a/configure.ac
+++ b/configure.ac
@@ -560,7 +560,7 @@ AC_SEARCH_LIBS(fdatasync, [rt])
AC_ARG_ENABLE(debug, AC_HELP_STRING([--enable-debug],[enable debugging & verbose explain]),
[use_debug=$enableval],[use_debug=no])
if test "${use_debug}" = "yes" ; then
- TARGET_DEBUG="-DSQLITE_DEBUG=1"
+ TARGET_DEBUG="-DSQLITE_DEBUG=1 -DSQLITE_ENABLE_SELECTTRACE -DSQLITE_ENABLE_WHERETRACE -O0"
else
TARGET_DEBUG="-DNDEBUG"
fi
@@ -596,7 +596,7 @@ AC_ARG_ENABLE(memsys5,
[enable_memsys5=yes],[enable_memsys5=no])
AC_MSG_CHECKING([whether to support MEMSYS5])
if test "${enable_memsys5}" = "yes"; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_MEMSYS5"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MEMSYS5"
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
@@ -606,7 +606,7 @@ AC_ARG_ENABLE(memsys3,
[enable_memsys3=yes],[enable_memsys3=no])
AC_MSG_CHECKING([whether to support MEMSYS3])
if test "${enable_memsys3}" = "yes" -a "${enable_memsys5}" = "no"; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_MEMSYS3"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_MEMSYS3"
AC_MSG_RESULT([yes])
else
AC_MSG_RESULT([no])
@@ -618,20 +618,20 @@ AC_ARG_ENABLE(fts3, AC_HELP_STRING([--enable-fts3],
[Enable the FTS3 extension]),
[enable_fts3=yes],[enable_fts3=no])
if test "${enable_fts3}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_FTS3"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS3"
fi
AC_ARG_ENABLE(fts4, AC_HELP_STRING([--enable-fts4],
[Enable the FTS4 extension]),
[enable_fts4=yes],[enable_fts4=no])
if test "${enable_fts4}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_FTS4"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS4"
AC_SEARCH_LIBS([log],[m])
fi
AC_ARG_ENABLE(fts5, AC_HELP_STRING([--enable-fts5],
[Enable the FTS5 extension]),
[enable_fts5=yes],[enable_fts5=no])
if test "${enable_fts5}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_FTS5"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_FTS5"
AC_SEARCH_LIBS([log],[m])
fi
@@ -641,7 +641,7 @@ AC_ARG_ENABLE(json1, AC_HELP_STRING([--enable-json1],
[Enable the JSON1 extension]),
[enable_json1=yes],[enable_json1=no])
if test "${enable_json1}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_JSON1"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_JSON1"
fi
#########
@@ -650,7 +650,7 @@ AC_ARG_ENABLE(rtree, AC_HELP_STRING([--enable-rtree],
[Enable the RTREE extension]),
[enable_rtree=yes],[enable_rtree=no])
if test "${enable_rtree}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_RTREE"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_RTREE"
fi
#########
@@ -659,12 +659,12 @@ AC_ARG_ENABLE(session, AC_HELP_STRING([--enable-session],
[Enable the SESSION extension]),
[enable_session=yes],[enable_session=no])
if test "${enable_session}" = "yes" ; then
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_SESSION"
- OPT_FEATURE_FLAGS+=" -DSQLITE_ENABLE_PREUPDATE_HOOK"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_SESSION"
+ OPT_FEATURE_FLAGS="${OPT_FEATURE_FLAGS} -DSQLITE_ENABLE_PREUPDATE_HOOK"
fi
#########
-# attempt to duplicate any OMITS and ENABLES into the $(OPT_FEATURE_FLAGS) parameter
+# attempt to duplicate any OMITS and ENABLES into the ${OPT_FEATURE_FLAGS} parameter
for option in $CFLAGS $CPPFLAGS
do
case $option in
diff --git a/doc/lemon.html b/doc/lemon.html
index 114526f..f05c481 100644
--- a/doc/lemon.html
+++ b/doc/lemon.html
@@ -23,6 +23,26 @@ or embedded controllers.
This document is an introduction to the Lemon
parser generator.
+<h2>Security Note</h2>
+
+<p>The language parser code created by Lemon is very robust and
+is well-suited for use in internet-facing applications that need to
+safely process maliciously crafted inputs.
+
+<p>The "lemon.exe" command-line tool itself works great when given a valid
+input grammar file and almost always gives helpful
+error messages for malformed inputs. However, it is possible for
+a malicious user to craft a grammar file that will cause
+lemon.exe to crash.
+We do not see this as a problem, as lemon.exe is not intended to be used
+with hostile inputs.
+To summarize:</p>
+
+<ul>
+<li>Parser code generated by lemon &rarr; Robust and secure
+<li>The "lemon.exe" command line tool itself &rarr; Not so much
+</ul>
+
Theory of Operation
The main goal of Lemon is to translate a context free grammar (CFG)
diff --git a/ext/README.md b/ext/README.md
new file mode 100644
index 0000000..933a33d
--- /dev/null
+++ b/ext/README.md
@@ -0,0 +1,8 @@
+## Loadable Extensions
+
+Various [loadable extensions](https://www.sqlite.org/loadext.html) for
+SQLite are found in subfolders.
+
+Most subfolders are dedicated to a single loadable extension (for
+example FTS5, or RTREE). But the misc/ subfolder contains a collection
+of smaller single-file extensions.
diff --git a/ext/README.txt b/ext/README.txt
deleted file mode 100644
index 009495f..0000000
--- a/ext/README.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-Version loadable extensions to SQLite are found in subfolders
-of this folder.
diff --git a/ext/fts3/fts3.c b/ext/fts3/fts3.c
index 748faef..f514542 100644
--- a/ext/fts3/fts3.c
+++ b/ext/fts3/fts3.c
@@ -349,8 +349,9 @@ int sqlite3Fts3PutVarint(char *p, sqlite_int64 v){
** Return the number of bytes read, or 0 on error.
** The value is stored in *v.
*/
-int sqlite3Fts3GetVarint(const char *p, sqlite_int64 *v){
- const char *pStart = p;
+int sqlite3Fts3GetVarint(const char *pBuf, sqlite_int64 *v){
+ const unsigned char *p = (const unsigned char*)pBuf;
+ const unsigned char *pStart = p;
u32 a;
u64 b;
int shift;
@@ -371,8 +372,8 @@ int sqlite3Fts3GetVarint(const char *p, sqlite_int64 *v){
}
/*
-** Similar to sqlite3Fts3GetVarint(), except that the output is truncated to a
-** 32-bit integer before it is returned.
+** Similar to sqlite3Fts3GetVarint(), except that the output is truncated to
+** a non-negative 32-bit integer before it is returned.
*/
int sqlite3Fts3GetVarint32(const char *p, int *pi){
u32 a;
@@ -388,7 +389,9 @@ int sqlite3Fts3GetVarint32(const char *p, int *pi){
GETVARINT_STEP(a, p, 14, 0x3FFF, 0x200000, *pi, 3);
GETVARINT_STEP(a, p, 21, 0x1FFFFF, 0x10000000, *pi, 4);
a = (a & 0x0FFFFFFF );
- *pi = (int)(a | ((u32)(*p & 0x0F) << 28));
+ *pi = (int)(a | ((u32)(*p & 0x07) << 28));
+ assert( 0==(a & 0x80000000) );
+ assert( *pi>=0 );
return 5;
}
@@ -492,6 +495,7 @@ static int fts3DisconnectMethod(sqlite3_vtab *pVtab){
assert( p->pSegments==0 );
/* Free any prepared statements held */
+ sqlite3_finalize(p->pSeekStmt);
for(i=0; i<SizeofArray(p->aStmt); i++){
sqlite3_finalize(p->aStmt[i]);
}
@@ -1217,65 +1221,66 @@ static int fts3InitVtab(
break;
}
}
- if( iOpt==SizeofArray(aFts4Opt) ){
- sqlite3Fts3ErrMsg(pzErr, "unrecognized parameter: %s", z);
- rc = SQLITE_ERROR;
- }else{
- switch( iOpt ){
- case 0: /* MATCHINFO */
- if( strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "fts3", 4) ){
- sqlite3Fts3ErrMsg(pzErr, "unrecognized matchinfo: %s", zVal);
- rc = SQLITE_ERROR;
- }
- bNoDocsize = 1;
- break;
+ switch( iOpt ){
+ case 0: /* MATCHINFO */
+ if( strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "fts3", 4) ){
+ sqlite3Fts3ErrMsg(pzErr, "unrecognized matchinfo: %s", zVal);
+ rc = SQLITE_ERROR;
+ }
+ bNoDocsize = 1;
+ break;
- case 1: /* PREFIX */
- sqlite3_free(zPrefix);
- zPrefix = zVal;
- zVal = 0;
- break;
+ case 1: /* PREFIX */
+ sqlite3_free(zPrefix);
+ zPrefix = zVal;
+ zVal = 0;
+ break;
- case 2: /* COMPRESS */
- sqlite3_free(zCompress);
- zCompress = zVal;
- zVal = 0;
- break;
+ case 2: /* COMPRESS */
+ sqlite3_free(zCompress);
+ zCompress = zVal;
+ zVal = 0;
+ break;
- case 3: /* UNCOMPRESS */
- sqlite3_free(zUncompress);
- zUncompress = zVal;
- zVal = 0;
- break;
+ case 3: /* UNCOMPRESS */
+ sqlite3_free(zUncompress);
+ zUncompress = zVal;
+ zVal = 0;
+ break;
- case 4: /* ORDER */
- if( (strlen(zVal)!=3 || sqlite3_strnicmp(zVal, "asc", 3))
- && (strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "desc", 4))
- ){
- sqlite3Fts3ErrMsg(pzErr, "unrecognized order: %s", zVal);
- rc = SQLITE_ERROR;
- }
- bDescIdx = (zVal[0]=='d' || zVal[0]=='D');
- break;
+ case 4: /* ORDER */
+ if( (strlen(zVal)!=3 || sqlite3_strnicmp(zVal, "asc", 3))
+ && (strlen(zVal)!=4 || sqlite3_strnicmp(zVal, "desc", 4))
+ ){
+ sqlite3Fts3ErrMsg(pzErr, "unrecognized order: %s", zVal);
+ rc = SQLITE_ERROR;
+ }
+ bDescIdx = (zVal[0]=='d' || zVal[0]=='D');
+ break;
- case 5: /* CONTENT */
- sqlite3_free(zContent);
- zContent = zVal;
- zVal = 0;
- break;
+ case 5: /* CONTENT */
+ sqlite3_free(zContent);
+ zContent = zVal;
+ zVal = 0;
+ break;
- case 6: /* LANGUAGEID */
- assert( iOpt==6 );
- sqlite3_free(zLanguageid);
- zLanguageid = zVal;
- zVal = 0;
- break;
+ case 6: /* LANGUAGEID */
+ assert( iOpt==6 );
+ sqlite3_free(zLanguageid);
+ zLanguageid = zVal;
+ zVal = 0;
+ break;
- case 7: /* NOTINDEXED */
- azNotindexed[nNotindexed++] = zVal;
- zVal = 0;
- break;
- }
+ case 7: /* NOTINDEXED */
+ azNotindexed[nNotindexed++] = zVal;
+ zVal = 0;
+ break;
+
+ default:
+ assert( iOpt==SizeofArray(aFts4Opt) );
+ sqlite3Fts3ErrMsg(pzErr, "unrecognized parameter: %s", z);
+ rc = SQLITE_ERROR;
+ break;
}
sqlite3_free(zVal);
}
@@ -1363,9 +1368,9 @@ static int fts3InitVtab(
p->pTokenizer = pTokenizer;
p->nMaxPendingData = FTS3_MAX_PENDING_DATA;
p->bHasDocsize = (isFts4 && bNoDocsize==0);
- p->bHasStat = isFts4;
- p->bFts4 = isFts4;
- p->bDescIdx = bDescIdx;
+ p->bHasStat = (u8)isFts4;
+ p->bFts4 = (u8)isFts4;
+ p->bDescIdx = (u8)bDescIdx;
p->nAutoincrmerge = 0xff; /* 0xff means setting unknown */
p->zContentTbl = zContent;
p->zLanguageid = zLanguageid;
@@ -1396,7 +1401,9 @@ static int fts3InitVtab(
char *z;
int n = 0;
z = (char *)sqlite3Fts3NextToken(aCol[iCol], &n);
- memcpy(zCsr, z, n);
+ if( n>0 ){
+ memcpy(zCsr, z, n);
+ }
zCsr[n] = '\0';
sqlite3Fts3Dequote(zCsr);
p->azColumn[iCol] = zCsr;
@@ -1680,6 +1687,39 @@ static int fts3OpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){
return SQLITE_OK;
}
+/*
+** Finalize the statement handle at pCsr->pStmt.
+**
+** Or, if that statement handle is one created by fts3CursorSeekStmt(),
+** and the Fts3Table.pSeekStmt slot is currently NULL, save the statement
+** pointer there instead of finalizing it.
+*/
+static void fts3CursorFinalizeStmt(Fts3Cursor *pCsr){
+ if( pCsr->bSeekStmt ){
+ Fts3Table *p = (Fts3Table *)pCsr->base.pVtab;
+ if( p->pSeekStmt==0 ){  /* table-level cache slot is free */
+ p->pSeekStmt = pCsr->pStmt;
+ sqlite3_reset(pCsr->pStmt);
+ pCsr->pStmt = 0;  /* cursor no longer owns it; finalize below sees NULL */
+ }
+ pCsr->bSeekStmt = 0;
+ }
+ sqlite3_finalize(pCsr->pStmt);  /* harmless no-op when pStmt is NULL */
+}
+
+/*
+** Free all resources currently held by the cursor passed as the only
+** argument, then zero the part of the Fts3Cursor that follows its base.
+*/
+static void fts3ClearCursor(Fts3Cursor *pCsr){
+ fts3CursorFinalizeStmt(pCsr);
+ sqlite3Fts3FreeDeferredTokens(pCsr);
+ sqlite3_free(pCsr->aDoclist);
+ sqlite3Fts3MIBufferFree(pCsr->pMIBuffer);
+ sqlite3Fts3ExprFree(pCsr->pExpr);
+ memset(&(&pCsr->base)[1], 0, sizeof(Fts3Cursor)-sizeof(sqlite3_vtab_cursor));
+}
+
/*
** Close the cursor. For additional information see the documentation
** on the xClose method of the virtual table interface.
@@ -1687,11 +1727,7 @@ static int fts3OpenMethod(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCsr){
static int fts3CloseMethod(sqlite3_vtab_cursor *pCursor){
Fts3Cursor *pCsr = (Fts3Cursor *)pCursor;
assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
- sqlite3_finalize(pCsr->pStmt);
- sqlite3Fts3ExprFree(pCsr->pExpr);
- sqlite3Fts3FreeDeferredTokens(pCsr);
- sqlite3_free(pCsr->aDoclist);
- sqlite3Fts3MIBufferFree(pCsr->pMIBuffer);
+ fts3ClearCursor(pCsr);
assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
sqlite3_free(pCsr);
return SQLITE_OK;
@@ -1705,20 +1741,23 @@ static int fts3CloseMethod(sqlite3_vtab_cursor *pCursor){
**
** (or the equivalent for a content=xxx table) and set pCsr->pStmt to
** it. If an error occurs, return an SQLite error code.
-**
-** Otherwise, set *ppStmt to point to pCsr->pStmt and return SQLITE_OK.
*/
-static int fts3CursorSeekStmt(Fts3Cursor *pCsr, sqlite3_stmt **ppStmt){
+static int fts3CursorSeekStmt(Fts3Cursor *pCsr){
int rc = SQLITE_OK;
if( pCsr->pStmt==0 ){
Fts3Table *p = (Fts3Table *)pCsr->base.pVtab;
char *zSql;
- zSql = sqlite3_mprintf("SELECT %s WHERE rowid = ?", p->zReadExprlist);
- if( !zSql ) return SQLITE_NOMEM;
- rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0);
- sqlite3_free(zSql);
+ if( p->pSeekStmt ){
+ pCsr->pStmt = p->pSeekStmt;  /* reuse the statement cached on the table */
+ p->pSeekStmt = 0;  /* the cursor owns it now */
+ }else{
+ zSql = sqlite3_mprintf("SELECT %s WHERE rowid = ?", p->zReadExprlist);
+ if( !zSql ) return SQLITE_NOMEM;
+ rc = sqlite3_prepare_v3(p->db, zSql,-1,SQLITE_PREPARE_PERSISTENT,&pCsr->pStmt,0);
+ sqlite3_free(zSql);
+ }
+ if( rc==SQLITE_OK ) pCsr->bSeekStmt = 1;  /* lets fts3CursorFinalizeStmt() re-cache it */
}
- *ppStmt = pCsr->pStmt;
return rc;
}
@@ -1730,9 +1769,7 @@ static int fts3CursorSeekStmt(Fts3Cursor *pCsr, sqlite3_stmt **ppStmt){
static int fts3CursorSeek(sqlite3_context *pContext, Fts3Cursor *pCsr){
int rc = SQLITE_OK;
if( pCsr->isRequireSeek ){
- sqlite3_stmt *pStmt = 0;
-
- rc = fts3CursorSeekStmt(pCsr, &pStmt);
+ rc = fts3CursorSeekStmt(pCsr);
if( rc==SQLITE_OK ){
sqlite3_bind_int64(pCsr->pStmt, 1, pCsr->iPrevId);
pCsr->isRequireSeek = 0;
@@ -1821,7 +1858,8 @@ static int fts3ScanInteriorNode(
isFirstTerm = 0;
zCsr += fts3GetVarint32(zCsr, &nSuffix);
- if( nPrefix<0 || nSuffix<0 || &zCsr[nSuffix]>zEnd ){
+ assert( nPrefix>=0 && nSuffix>=0 );
+ if( &zCsr[nSuffix]>zEnd ){
rc = FTS_CORRUPT_VTAB;
goto finish_scan;
}
@@ -2631,7 +2669,7 @@ int sqlite3Fts3FirstFilter(
fts3ColumnlistCopy(0, &p);
}
- while( ppStmt);
- sqlite3_free(pCsr->aDoclist);
- sqlite3Fts3MIBufferFree(pCsr->pMIBuffer);
- sqlite3Fts3ExprFree(pCsr->pExpr);
- memset(&pCursor[1], 0, sizeof(Fts3Cursor)-sizeof(sqlite3_vtab_cursor));
+ fts3ClearCursor(pCsr);
/* Set the lower and upper bounds on docids to return */
pCsr->iMinDocid = fts3DocidRange(pDocidGe, SMALLEST_INT64);
@@ -3252,13 +3286,13 @@ static int fts3FilterMethod(
);
}
if( zSql ){
- rc = sqlite3_prepare_v2(p->db, zSql, -1, &pCsr->pStmt, 0);
+ rc = sqlite3_prepare_v3(p->db,zSql,-1,SQLITE_PREPARE_PERSISTENT,&pCsr->pStmt,0);
sqlite3_free(zSql);
}else{
rc = SQLITE_NOMEM;
}
}else if( eSearch==FTS3_DOCID_SEARCH ){
- rc = fts3CursorSeekStmt(pCsr, &pCsr->pStmt);
+ rc = fts3CursorSeekStmt(pCsr);
if( rc==SQLITE_OK ){
rc = sqlite3_bind_value(pCsr->pStmt, 1, pCons);
}
@@ -3273,7 +3307,12 @@ static int fts3FilterMethod(
** routine to find out if it has reached the end of a result set.
*/
static int fts3EofMethod(sqlite3_vtab_cursor *pCursor){
- return ((Fts3Cursor *)pCursor)->isEof;
+ Fts3Cursor *pCsr = (Fts3Cursor*)pCursor;
+ if( pCsr->isEof ){
+ fts3ClearCursor(pCsr);  /* release the cursor's resources once at EOF */
+ pCsr->isEof = 1;  /* restore the flag: fts3ClearCursor() zeroes the cursor fields */
+ }
+ return pCsr->isEof;
}
/*
@@ -3311,33 +3350,37 @@ static int fts3ColumnMethod(
/* The column value supplied by SQLite must be in range. */
assert( iCol>=0 && iCol<=p->nColumn+2 );
- if( iCol==p->nColumn+1 ){
- /* This call is a request for the "docid" column. Since "docid" is an
- ** alias for "rowid", use the xRowid() method to obtain the value.
- */
- sqlite3_result_int64(pCtx, pCsr->iPrevId);
- }else if( iCol==p->nColumn ){
- /* The extra column whose name is the same as the table.
- ** Return a blob which is a pointer to the cursor. */
- sqlite3_result_blob(pCtx, &pCsr, sizeof(pCsr), SQLITE_TRANSIENT);
- }else if( iCol==p->nColumn+2 && pCsr->pExpr ){
- sqlite3_result_int64(pCtx, pCsr->iLangid);
- }else{
- /* The requested column is either a user column (one that contains
- ** indexed data), or the language-id column. */
- rc = fts3CursorSeek(0, pCsr);
+ switch( iCol-p->nColumn ){
+ case 0:
+ /* The special 'table-name' column */
+ sqlite3_result_pointer(pCtx, pCsr, "fts3cursor", 0);
+ break;
- if( rc==SQLITE_OK ){
- if( iCol==p->nColumn+2 ){
- int iLangid = 0;
- if( p->zLanguageid ){
- iLangid = sqlite3_column_int(pCsr->pStmt, p->nColumn+1);
- }
- sqlite3_result_int(pCtx, iLangid);
- }else if( sqlite3_data_count(pCsr->pStmt)>(iCol+1) ){
+ case 1:
+ /* The docid column */
+ sqlite3_result_int64(pCtx, pCsr->iPrevId);
+ break;
+
+ case 2:
+ if( pCsr->pExpr ){
+ sqlite3_result_int64(pCtx, pCsr->iLangid);
+ break;
+ }else if( p->zLanguageid==0 ){
+ sqlite3_result_int(pCtx, 0);
+ break;
+ }else{
+ iCol = p->nColumn;
+ /* fall-through */
+ }
+
+ default:
+ /* A user column. Or, if this is a full-table scan, possibly the
+ ** language-id column. Seek the cursor. */
+ rc = fts3CursorSeek(0, pCsr);
+ if( rc==SQLITE_OK && sqlite3_data_count(pCsr->pStmt)-1>iCol ){
sqlite3_result_value(pCtx, sqlite3_column_value(pCsr->pStmt, iCol+1));
}
- }
+ break;
}
assert( ((Fts3Table *)pCsr->base.pVtab)->pSegments==0 );
@@ -3386,8 +3429,10 @@ static int fts3SyncMethod(sqlite3_vtab *pVtab){
const u32 nMinMerge = 64; /* Minimum amount of incr-merge work to do */
Fts3Table *p = (Fts3Table*)pVtab;
- int rc = sqlite3Fts3PendingTermsFlush(p);
+ int rc;
+ i64 iLastRowid = sqlite3_last_insert_rowid(p->db);
+ rc = sqlite3Fts3PendingTermsFlush(p);
if( rc==SQLITE_OK
&& p->nLeafAdd>(nMinMerge/16)
&& p->nAutoincrmerge && p->nAutoincrmerge!=0xff
@@ -3402,6 +3447,7 @@ static int fts3SyncMethod(sqlite3_vtab *pVtab){
if( A>(int)nMinMerge ) rc = sqlite3Fts3Incrmerge(p, A, p->nAutoincrmerge);
}
sqlite3Fts3SegmentsClose(p);
+ sqlite3_set_last_insert_rowid(p->db, iLastRowid);
return rc;
}
@@ -3414,17 +3460,11 @@ static int fts3SyncMethod(sqlite3_vtab *pVtab){
static int fts3SetHasStat(Fts3Table *p){
int rc = SQLITE_OK;
if( p->bHasStat==2 ){
- const char *zFmt ="SELECT 1 FROM %Q.sqlite_master WHERE tbl_name='%q_stat'";
- char *zSql = sqlite3_mprintf(zFmt, p->zDb, p->zName);
- if( zSql ){
- sqlite3_stmt *pStmt = 0;
- rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, 0);
- if( rc==SQLITE_OK ){
- int bHasStat = (sqlite3_step(pStmt)==SQLITE_ROW);
- rc = sqlite3_finalize(pStmt);
- if( rc==SQLITE_OK ) p->bHasStat = bHasStat;
- }
- sqlite3_free(zSql);
+ char *zTbl = sqlite3_mprintf("%s_stat", p->zName);
+ if( zTbl ){
+ int res = sqlite3_table_column_metadata(p->db, p->zDb, zTbl, 0,0,0,0,0,0);
+ sqlite3_free(zTbl);
+ p->bHasStat = (res==SQLITE_OK);
}else{
rc = SQLITE_NOMEM;
}
@@ -3531,18 +3571,17 @@ static int fts3FunctionArg(
sqlite3_value *pVal, /* argv[0] passed to function */
Fts3Cursor **ppCsr /* OUT: Store cursor handle here */
){
- Fts3Cursor *pRet;
- if( sqlite3_value_type(pVal)!=SQLITE_BLOB
- || sqlite3_value_bytes(pVal)!=sizeof(Fts3Cursor *)
- ){
+ int rc;
+ *ppCsr = (Fts3Cursor*)sqlite3_value_pointer(pVal, "fts3cursor");
+ if( (*ppCsr)!=0 ){
+ rc = SQLITE_OK;
+ }else{
char *zErr = sqlite3_mprintf("illegal first argument to %s", zFunc);
sqlite3_result_error(pContext, zErr, -1);
sqlite3_free(zErr);
- return SQLITE_ERROR;
+ rc = SQLITE_ERROR;
}
- memcpy(&pRet, sqlite3_value_blob(pVal), sizeof(Fts3Cursor *));
- *ppCsr = pRet;
- return SQLITE_OK;
+ return rc;
}
/*
@@ -3929,7 +3968,7 @@ int sqlite3Fts3Init(sqlite3 *db){
#endif
/* Create the virtual table wrapper around the hash-table and overload
- ** the two scalar functions. If this is successful, register the
+ ** the four scalar functions. If this is successful, register the
** module with sqlite.
*/
if( SQLITE_OK==rc
@@ -4512,7 +4551,7 @@ static int fts3EvalIncrPhraseNext(
** one incremental token. In which case the bIncr flag is set. */
assert( p->bIncr==1 );
- if( p->nToken==1 && p->bIncr ){
+ if( p->nToken==1 ){
rc = sqlite3Fts3MsrIncrNext(pTab, p->aToken[0].pSegcsr,
&pDL->iDocid, &pDL->pList, &pDL->nList
);
@@ -4745,6 +4784,7 @@ static void fts3EvalTokenCosts(
** the number of overflow pages consumed by a record B bytes in size.
*/
static int fts3EvalAverageDocsize(Fts3Cursor *pCsr, int *pnPage){
+ int rc = SQLITE_OK;
if( pCsr->nRowAvg==0 ){
/* The average document size, which is required to calculate the cost
** of each doclist, has not yet been determined. Read the required
@@ -4757,7 +4797,6 @@ static int fts3EvalAverageDocsize(Fts3Cursor *pCsr, int *pnPage){
** data stored in all rows of each column of the table, from left
** to right.
*/
- int rc;
Fts3Table *p = (Fts3Table*)pCsr->base.pVtab;
sqlite3_stmt *pStmt;
sqlite3_int64 nDoc = 0;
@@ -4784,11 +4823,10 @@ static int fts3EvalAverageDocsize(Fts3Cursor *pCsr, int *pnPage){
pCsr->nRowAvg = (int)(((nByte / nDoc) + p->nPgsz) / p->nPgsz);
assert( pCsr->nRowAvg>0 );
rc = sqlite3_reset(pStmt);
- if( rc!=SQLITE_OK ) return rc;
}
*pnPage = pCsr->nRowAvg;
- return SQLITE_OK;
+ return rc;
}
/*
@@ -5138,7 +5176,8 @@ static void fts3EvalNextRow(
pExpr->iDocid = pLeft->iDocid;
pExpr->bEof = (pLeft->bEof || pRight->bEof);
if( pExpr->eType==FTSQUERY_NEAR && pExpr->bEof ){
- if( pRight->pPhrase && pRight->pPhrase->doclist.aAll ){
+ assert( pRight->eType==FTSQUERY_PHRASE );
+ if( pRight->pPhrase->doclist.aAll ){
Fts3Doclist *pDl = &pRight->pPhrase->doclist;
while( *pRc==SQLITE_OK && pRight->bEof==0 ){
memset(pDl->pList, 0, pDl->nList);
@@ -5167,7 +5206,7 @@ static void fts3EvalNextRow(
if( pRight->bEof || (pLeft->bEof==0 && iCmp<0) ){
fts3EvalNextRow(pCsr, pLeft, pRc);
- }else if( pLeft->bEof || (pRight->bEof==0 && iCmp>0) ){
+ }else if( pLeft->bEof || iCmp>0 ){
fts3EvalNextRow(pCsr, pRight, pRc);
}else{
fts3EvalNextRow(pCsr, pLeft, pRc);
@@ -5259,7 +5298,6 @@ static int fts3EvalNearTest(Fts3Expr *pExpr, int *pRc){
*/
if( *pRc==SQLITE_OK
&& pExpr->eType==FTSQUERY_NEAR
- && pExpr->bEof==0
&& (pExpr->pParent==0 || pExpr->pParent->eType!=FTSQUERY_NEAR)
){
Fts3Expr *p;
@@ -5268,42 +5306,39 @@ static int fts3EvalNearTest(Fts3Expr *pExpr, int *pRc){
/* Allocate temporary working space. */
for(p=pExpr; p->pLeft; p=p->pLeft){
+ assert( p->pRight->pPhrase->doclist.nList>0 );
nTmp += p->pRight->pPhrase->doclist.nList;
}
nTmp += p->pPhrase->doclist.nList;
- if( nTmp==0 ){
+ aTmp = sqlite3_malloc(nTmp*2);
+ if( !aTmp ){
+ *pRc = SQLITE_NOMEM;
res = 0;
}else{
- aTmp = sqlite3_malloc(nTmp*2);
- if( !aTmp ){
- *pRc = SQLITE_NOMEM;
- res = 0;
- }else{
- char *aPoslist = p->pPhrase->doclist.pList;
- int nToken = p->pPhrase->nToken;
+ char *aPoslist = p->pPhrase->doclist.pList;
+ int nToken = p->pPhrase->nToken;
- for(p=p->pParent;res && p && p->eType==FTSQUERY_NEAR; p=p->pParent){
- Fts3Phrase *pPhrase = p->pRight->pPhrase;
- int nNear = p->nNear;
- res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase);
- }
-
- aPoslist = pExpr->pRight->pPhrase->doclist.pList;
- nToken = pExpr->pRight->pPhrase->nToken;
- for(p=pExpr->pLeft; p && res; p=p->pLeft){
- int nNear;
- Fts3Phrase *pPhrase;
- assert( p->pParent && p->pParent->pLeft==p );
- nNear = p->pParent->nNear;
- pPhrase = (
- p->eType==FTSQUERY_NEAR ? p->pRight->pPhrase : p->pPhrase
- );
- res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase);
- }
+ for(p=p->pParent;res && p && p->eType==FTSQUERY_NEAR; p=p->pParent){
+ Fts3Phrase *pPhrase = p->pRight->pPhrase;
+ int nNear = p->nNear;
+ res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase);
}
- sqlite3_free(aTmp);
+ aPoslist = pExpr->pRight->pPhrase->doclist.pList;
+ nToken = pExpr->pRight->pPhrase->nToken;
+ for(p=pExpr->pLeft; p && res; p=p->pLeft){
+ int nNear;
+ Fts3Phrase *pPhrase;
+ assert( p->pParent && p->pParent->pLeft==p );
+ nNear = p->pParent->nNear;
+ pPhrase = (
+ p->eType==FTSQUERY_NEAR ? p->pRight->pPhrase : p->pPhrase
+ );
+ res = fts3EvalNearTrim(nNear, aTmp, &aPoslist, &nToken, pPhrase);
+ }
}
+
+ sqlite3_free(aTmp);
}
return res;
diff --git a/ext/fts3/fts3Int.h b/ext/fts3/fts3Int.h
index 0c86c42..c3cab9d 100644
--- a/ext/fts3/fts3Int.h
+++ b/ext/fts3/fts3Int.h
@@ -230,6 +230,7 @@ struct Fts3Table {
** statements is run and reset within a single virtual table API call.
*/
sqlite3_stmt *aStmt[40];
+ sqlite3_stmt *pSeekStmt; /* Cache for fts3CursorSeekStmt() */
char *zReadExprlist;
char *zWriteExprlist;
@@ -299,6 +300,7 @@ struct Fts3Cursor {
i16 eSearch; /* Search strategy (see below) */
u8 isEof; /* True if at End Of Results */
u8 isRequireSeek; /* True if must seek pStmt to %_content row */
+ u8 bSeekStmt; /* True if pStmt came from fts3CursorSeekStmt() */
sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
Fts3Expr *pExpr; /* Parsed MATCH query string */
int iLangid; /* Language being queried for */
diff --git a/ext/fts3/fts3_unicode.c b/ext/fts3/fts3_unicode.c
index 94fc27b..dfb2680 100644
--- a/ext/fts3/fts3_unicode.c
+++ b/ext/fts3/fts3_unicode.c
@@ -136,16 +136,16 @@ static int unicodeAddExceptions(
){
const unsigned char *z = (const unsigned char *)zIn;
const unsigned char *zTerm = &z[nIn];
- int iCode;
+ unsigned int iCode;
int nEntry = 0;
assert( bAlnum==0 || bAlnum==1 );
while( zi; j--) aNew[j] = aNew[j-1];
- aNew[i] = iCode;
+ aNew[i] = (int)iCode;
nNew++;
}
}
@@ -318,7 +318,7 @@ static int unicodeNext(
){
unicode_cursor *pCsr = (unicode_cursor *)pC;
unicode_tokenizer *p = ((unicode_tokenizer *)pCsr->base.pTokenizer);
- int iCode = 0;
+ unsigned int iCode = 0;
char *zOut;
const unsigned char *z = &pCsr->aInput[pCsr->iOff];
const unsigned char *zStart = z;
@@ -330,7 +330,7 @@ static int unicodeNext(
** the input. */
while( z=zTerm ) return SQLITE_DONE;
@@ -350,7 +350,7 @@ static int unicodeNext(
/* Write the folded case of the last character read to the output */
zEnd = z;
- iOut = sqlite3FtsUnicodeFold(iCode, p->bRemoveDiacritic);
+ iOut = sqlite3FtsUnicodeFold((int)iCode, p->bRemoveDiacritic);
if( iOut ){
WRITE_UTF8(zOut, iOut);
}
@@ -358,8 +358,8 @@ static int unicodeNext(
/* If the cursor is not at EOF, read the next character */
if( z>=zTerm ) break;
READ_UTF8(z, zTerm, iCode);
- }while( unicodeIsAlnum(p, iCode)
- || sqlite3FtsUnicodeIsdiacritic(iCode)
+ }while( unicodeIsAlnum(p, (int)iCode)
+ || sqlite3FtsUnicodeIsdiacritic((int)iCode)
);
/* Set the output variables and return. */
diff --git a/ext/fts3/fts3_unicode2.c b/ext/fts3/fts3_unicode2.c
index 20b7a25..da7251e 100644
--- a/ext/fts3/fts3_unicode2.c
+++ b/ext/fts3/fts3_unicode2.c
@@ -127,9 +127,9 @@ int sqlite3FtsUnicodeIsalnum(int c){
0xFFFFFFFF, 0xFC00FFFF, 0xF8000001, 0xF8000001,
};
- if( c<128 ){
- return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
- }else if( c<(1<<22) ){
+ if( (unsigned int)c<128 ){
+ return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
+ }else if( (unsigned int)c<(1<<22) ){
unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
int iRes = 0;
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
@@ -322,16 +322,17 @@ int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
int ret = c;
- assert( c>=0 );
assert( sizeof(unsigned short)==2 && sizeof(unsigned char)==1 );
if( c<128 ){
if( c>='A' && c<='Z' ) ret = c + ('a' - 'A');
}else if( c<65536 ){
+ const struct TableEntry *p;
int iHi = sizeof(aEntry)/sizeof(aEntry[0]) - 1;
int iLo = 0;
int iRes = -1;
+ assert( c>aEntry[0].iCode );
while( iHi>=iLo ){
int iTest = (iHi + iLo) / 2;
int cmp = (c - aEntry[iTest].iCode);
@@ -342,14 +343,12 @@ int sqlite3FtsUnicodeFold(int c, int bRemoveDiacritic){
iHi = iTest-1;
}
}
- assert( iRes<0 || c>=aEntry[iRes].iCode );
- if( iRes>=0 ){
- const struct TableEntry *p = &aEntry[iRes];
- if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
- ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
- assert( ret>0 );
- }
+ assert( iRes>=0 && c>=aEntry[iRes].iCode );
+ p = &aEntry[iRes];
+ if( c<(p->iCode + p->nRange) && 0==(0x01 & p->flags & (p->iCode ^ c)) ){
+ ret = (c + (aiOff[p->flags>>1])) & 0x0000FFFF;
+ assert( ret>0 );
}
if( bRemoveDiacritic ) ret = remove_diacritic(ret);
diff --git a/ext/fts3/fts3_write.c b/ext/fts3/fts3_write.c
index 3ff481b..daf3399 100644
--- a/ext/fts3/fts3_write.c
+++ b/ext/fts3/fts3_write.c
@@ -407,7 +407,8 @@ static int fts3SqlStmt(
if( !zSql ){
rc = SQLITE_NOMEM;
}else{
- rc = sqlite3_prepare_v2(p->db, zSql, -1, &pStmt, NULL);
+ rc = sqlite3_prepare_v3(p->db, zSql, -1, SQLITE_PREPARE_PERSISTENT,
+ &pStmt, NULL);
sqlite3_free(zSql);
assert( rc==SQLITE_OK || pStmt==0 );
p->aStmt[eStmt] = pStmt;
@@ -4956,11 +4957,14 @@ int sqlite3Fts3Incrmerge(Fts3Table *p, int nMerge, int nMin){
** Convert the text beginning at *pz into an integer and return
** its value. Advance *pz to point to the first character past
** the integer.
+**
+** This function is used for parameters to the merge= and incrmerge=
+** commands.
*/
static int fts3Getint(const char **pz){
const char *z = *pz;
int i = 0;
- while( (*z)>='0' && (*z)<='9' ) i = 10*i + *(z++) - '0';
+ while( (*z)>='0' && (*z)<='9' && i<214748363 ) i = 10*i + *(z++) - '0'; /* i bound keeps 10*i+9 within a 32-bit int */
*pz = z;
return i;
}
diff --git a/ext/fts3/tool/fts3cov.sh b/ext/fts3/tool/fts3cov.sh
new file mode 100644
index 0000000..b1f34dc
--- /dev/null
+++ b/ext/fts3/tool/fts3cov.sh
@@ -0,0 +1,23 @@
+#!/bin/sh
+#
+# Run test/fts3.test with the ./testfixture binary in the current directory
+# (passing --output=fts3cov-out.txt), then print, for each ext/fts3/*.c
+# file, the "Taken at least once" figure reported by "gcov -b".
+
+set -e
+
+# Four dirname calls on the script path (ext/fts3/tool/fts3cov.sh) give the
+# top of the source tree.
+srcdir=`dirname $(dirname $(dirname $(dirname $0)))`
+./testfixture $srcdir/test/fts3.test --output=fts3cov-out.txt
+
+echo ""
+
+for f in `ls $srcdir/ext/fts3/*.c`
+do
+  f=`basename $f`
+  # printf, not "echo -ne": echo options are not portable across /bin/sh
+  printf '%s: ' "$f"
+  gcov -b $f | grep Taken | sed 's/Taken at least once://'
+done
+
diff --git a/ext/fts3/unicode/mkunicode.tcl b/ext/fts3/unicode/mkunicode.tcl
index aafb4e9..de89099 100644
--- a/ext/fts3/unicode/mkunicode.tcl
+++ b/ext/fts3/unicode/mkunicode.tcl
@@ -227,7 +227,7 @@ proc print_isalnum {zFunc lRange} {
an_print_ascii_bitmap $lRange
puts {
if( (unsigned int)c<128 ){
- return ( (aAscii[c >> 5] & (1 << (c & 0x001F)))==0 );
+ return ( (aAscii[c >> 5] & ((unsigned int)1 << (c & 0x001F)))==0 );
}else if( (unsigned int)c<(1<<22) ){
unsigned int key = (((unsigned int)c)<<10) | 0x000003FF;
int iRes = 0;
diff --git a/ext/fts5/fts5Int.h b/ext/fts5/fts5Int.h
index 9ef338e..63dc082 100644
--- a/ext/fts5/fts5Int.h
+++ b/ext/fts5/fts5Int.h
@@ -30,7 +30,9 @@ typedef short i16;
typedef sqlite3_int64 i64;
typedef sqlite3_uint64 u64;
-#define ArraySize(x) ((int)(sizeof(x) / sizeof(x[0])))
+#ifndef ArraySize
+# define ArraySize(x) ((int)(sizeof(x) / sizeof(x[0])))
+#endif
#define testcase(x)
#define ALWAYS(x) 1
@@ -444,9 +446,9 @@ int sqlite3Fts5IndexBeginWrite(
/*
** Flush any data stored in the in-memory hash tables to the database.
-** If the bCommit flag is true, also close any open blob handles.
+** Also close any open blob handles.
*/
-int sqlite3Fts5IndexSync(Fts5Index *p, int bCommit);
+int sqlite3Fts5IndexSync(Fts5Index *p);
/*
** Discard any data stored in the in-memory hash tables. Do not write it
@@ -616,7 +618,7 @@ int sqlite3Fts5StorageDocsize(Fts5Storage *p, i64 iRowid, int *aCol);
int sqlite3Fts5StorageSize(Fts5Storage *p, int iCol, i64 *pnAvg);
int sqlite3Fts5StorageRowCount(Fts5Storage *p, i64 *pnRow);
-int sqlite3Fts5StorageSync(Fts5Storage *p, int bCommit);
+int sqlite3Fts5StorageSync(Fts5Storage *p);
int sqlite3Fts5StorageRollback(Fts5Storage *p);
int sqlite3Fts5StorageConfigValue(
@@ -652,6 +654,7 @@ struct Fts5Token {
/* Parse a MATCH expression. */
int sqlite3Fts5ExprNew(
Fts5Config *pConfig,
+ int iCol, /* Column on LHS of MATCH operator */
const char *zExpr,
Fts5Expr **ppNew,
char **pzErr
@@ -736,7 +739,7 @@ void sqlite3Fts5ParseNearsetFree(Fts5ExprNearset*);
void sqlite3Fts5ParseNodeFree(Fts5ExprNode*);
void sqlite3Fts5ParseSetDistance(Fts5Parse*, Fts5ExprNearset*, Fts5Token*);
-void sqlite3Fts5ParseSetColset(Fts5Parse*, Fts5ExprNearset*, Fts5Colset*);
+void sqlite3Fts5ParseSetColset(Fts5Parse*, Fts5ExprNode*, Fts5Colset*);
Fts5Colset *sqlite3Fts5ParseColsetInvert(Fts5Parse*, Fts5Colset*);
void sqlite3Fts5ParseFinished(Fts5Parse *pParse, Fts5ExprNode *p);
void sqlite3Fts5ParseNear(Fts5Parse *pParse, Fts5Token*);
diff --git a/ext/fts5/fts5_buffer.c b/ext/fts5/fts5_buffer.c
index 0098846..b116897 100644
--- a/ext/fts5/fts5_buffer.c
+++ b/ext/fts5/fts5_buffer.c
@@ -67,9 +67,11 @@ void sqlite3Fts5BufferAppendBlob(
const u8 *pData
){
assert_nc( *pRc || nData>=0 );
- if( fts5BufferGrow(pRc, pBuf, nData) ) return;
- memcpy(&pBuf->p[pBuf->n], pData, nData);
- pBuf->n += nData;
+ if( nData ){
+ if( fts5BufferGrow(pRc, pBuf, nData) ) return;
+ memcpy(&pBuf->p[pBuf->n], pData, nData);
+ pBuf->n += nData;
+ }
}
/*
@@ -246,8 +248,8 @@ void *sqlite3Fts5MallocZero(int *pRc, int nByte){
void *pRet = 0;
if( *pRc==SQLITE_OK ){
pRet = sqlite3_malloc(nByte);
- if( pRet==0 && nByte>0 ){
- *pRc = SQLITE_NOMEM;
+ if( pRet==0 ){
+ if( nByte>0 ) *pRc = SQLITE_NOMEM;
}else{
memset(pRet, 0, nByte);
}
diff --git a/ext/fts5/fts5_expr.c b/ext/fts5/fts5_expr.c
index bde9eae..aa7141c 100644
--- a/ext/fts5/fts5_expr.c
+++ b/ext/fts5/fts5_expr.c
@@ -213,6 +213,7 @@ static void fts5ParseFree(void *p){ sqlite3_free(p); }
int sqlite3Fts5ExprNew(
Fts5Config *pConfig, /* FTS5 Configuration */
+ int iCol,
const char *zExpr, /* Expression text */
Fts5Expr **ppNew,
char **pzErr
@@ -237,6 +238,18 @@ int sqlite3Fts5ExprNew(
}while( sParse.rc==SQLITE_OK && t!=FTS5_EOF );
sqlite3Fts5ParserFree(pEngine, fts5ParseFree);
+  /* If the LHS of the MATCH expression was a user column, apply the
+  ** implicit column-filter. */
+  if( iCol<pConfig->nCol && sParse.pExpr && sParse.rc==SQLITE_OK ){
+    int n = sizeof(Fts5Colset);      /* Room for exactly one aiCol[] entry */
+    Fts5Colset *pColset = (Fts5Colset*)sqlite3Fts5MallocZero(&sParse.rc, n);
+    if( pColset ){
+      pColset->nCol = 1;
+      pColset->aiCol[0] = iCol;
+      sqlite3Fts5ParseSetColset(&sParse, sParse.pExpr, pColset);
+    }
+  }
+
assert( sParse.rc!=SQLITE_OK || sParse.zErr==0 );
if( sParse.rc==SQLITE_OK ){
*ppNew = pNew = sqlite3_malloc(sizeof(Fts5Expr));
@@ -746,48 +759,61 @@ static int fts5ExprNearTest(
** Initialize all term iterators in the pNear object. If any term is found
** to match no documents at all, return immediately without initializing any
** further iterators.
+**
+** If an error occurs, return an SQLite error code. Otherwise, return
+** SQLITE_OK. It is not considered an error if some term matches zero
+** documents.
*/
static int fts5ExprNearInitAll(
Fts5Expr *pExpr,
Fts5ExprNode *pNode
){
Fts5ExprNearset *pNear = pNode->pNear;
- int i, j;
- int rc = SQLITE_OK;
- int bEof = 1;
+ int i;
assert( pNode->bNomatch==0 );
- for(i=0; rc==SQLITE_OK && i<pNear->nPhrase; i++){
+ for(i=0; i<pNear->nPhrase; i++){
Fts5ExprPhrase *pPhrase = pNear->apPhrase[i];
- for(j=0; j<pPhrase->nTerm; j++){
- Fts5ExprTerm *pTerm = &pPhrase->aTerm[j];
- Fts5ExprTerm *p;
+ if( pPhrase->nTerm==0 ){ /* Empty phrase: node is at EOF */
+ pNode->bEof = 1;
+ return SQLITE_OK;
+ }else{
+ int j;
+ for(j=0; j<pPhrase->nTerm; j++){
+ Fts5ExprTerm *pTerm = &pPhrase->aTerm[j];
+ Fts5ExprTerm *p;
+ int bHit = 0; /* Set once any synonym iterator is not at EOF */
- for(p=pTerm; p && rc==SQLITE_OK; p=p->pSynonym){
- if( p->pIter ){
- sqlite3Fts5IterClose(p->pIter);
- p->pIter = 0;
+ for(p=pTerm; p; p=p->pSynonym){
+ int rc;
+ if( p->pIter ){
+ sqlite3Fts5IterClose(p->pIter);
+ p->pIter = 0;
+ }
+ rc = sqlite3Fts5IndexQuery(
+ pExpr->pIndex, p->zTerm, (int)strlen(p->zTerm),
+ (pTerm->bPrefix ? FTS5INDEX_QUERY_PREFIX : 0) |
+ (pExpr->bDesc ? FTS5INDEX_QUERY_DESC : 0),
+ pNear->pColset,
+ &p->pIter
+ );
+ assert( (rc==SQLITE_OK)==(p->pIter!=0) );
+ if( rc!=SQLITE_OK ) return rc;
+ if( 0==sqlite3Fts5IterEof(p->pIter) ){
+ bHit = 1;
+ }
}
- rc = sqlite3Fts5IndexQuery(
- pExpr->pIndex, p->zTerm, (int)strlen(p->zTerm),
- (pTerm->bPrefix ? FTS5INDEX_QUERY_PREFIX : 0) |
- (pExpr->bDesc ? FTS5INDEX_QUERY_DESC : 0),
- pNear->pColset,
- &p->pIter
- );
- assert( rc==SQLITE_OK || p->pIter==0 );
- if( p->pIter && 0==sqlite3Fts5IterEof(p->pIter) ){
- bEof = 0;
+
+ if( bHit==0 ){ /* The term and all its synonyms are at EOF */
+ pNode->bEof = 1;
+ return SQLITE_OK;
}
}
-
- if( bEof ) break;
}
- if( bEof ) break;
}
- pNode->bEof = bEof;
- return rc;
+ pNode->bEof = 0;
+ return SQLITE_OK;
}
/*
@@ -1097,7 +1123,10 @@ static int fts5ExprNodeNext_OR(
|| (bFromValid && fts5RowidCmp(pExpr, p1->iRowid, iFrom)<0)
){
int rc = fts5ExprNodeNext(pExpr, p1, bFromValid, iFrom);
- if( rc!=SQLITE_OK ) return rc;
+ if( rc!=SQLITE_OK ){
+ pNode->bNomatch = 0;
+ return rc;
+ }
}
}
}
@@ -1128,7 +1157,10 @@ static int fts5ExprNodeTest_AND(
if( cmp>0 ){
/* Advance pChild until it points to iLast or laster */
rc = fts5ExprNodeNext(pExpr, pChild, 1, iLast);
- if( rc!=SQLITE_OK ) return rc;
+ if( rc!=SQLITE_OK ){
+ pAnd->bNomatch = 0;
+ return rc;
+ }
}
/* If the child node is now at EOF, so is the parent AND node. Otherwise,
@@ -1167,6 +1199,8 @@ static int fts5ExprNodeNext_AND(
int rc = fts5ExprNodeNext(pExpr, pNode->apChild[0], bFromValid, iFrom);
if( rc==SQLITE_OK ){
rc = fts5ExprNodeTest_AND(pExpr, pNode);
+ }else{
+ pNode->bNomatch = 0;
}
return rc;
}
@@ -1209,6 +1243,9 @@ static int fts5ExprNodeNext_NOT(
if( rc==SQLITE_OK ){
rc = fts5ExprNodeTest_NOT(pExpr, pNode);
}
+ if( rc!=SQLITE_OK ){
+ pNode->bNomatch = 0;
+ }
return rc;
}
@@ -1331,7 +1368,10 @@ int sqlite3Fts5ExprFirst(Fts5Expr *p, Fts5Index *pIdx, i64 iFirst, int bDesc){
/* If not at EOF but the current rowid occurs earlier than iFirst in
** the iteration order, move to document iFirst or later. */
- if( pRoot->bEof==0 && fts5RowidCmp(p, pRoot->iRowid, iFirst)<0 ){
+ if( rc==SQLITE_OK
+ && 0==pRoot->bEof
+ && fts5RowidCmp(p, pRoot->iRowid, iFirst)<0
+ ){
rc = fts5ExprNodeNext(p, pRoot, 1, iFirst);
}
@@ -1585,7 +1625,7 @@ Fts5ExprPhrase *sqlite3Fts5ParseTerm(
rc = fts5ParseStringFromToken(pToken, &z);
if( rc==SQLITE_OK ){
- int flags = FTS5_TOKENIZE_QUERY | (bPrefix ? FTS5_TOKENIZE_QUERY : 0);
+ int flags = FTS5_TOKENIZE_QUERY | (bPrefix ? FTS5_TOKENIZE_PREFIX : 0);
int n;
sqlite3Fts5Dequote(z);
n = (int)strlen(z);
@@ -1859,25 +1899,110 @@ Fts5Colset *sqlite3Fts5ParseColset(
return pRet;
}
+/*
+** Return a heap-allocated duplicate of colset pOrig.
+**
+** NULL is returned if pOrig is NULL, or if sqlite3Fts5MallocZero() returns
+** NULL. The latter happens when (*pRc) already holds an error code on
+** entry, or when the allocation fails, in which case (*pRc) is set to
+** SQLITE_NOMEM.
+*/
+static Fts5Colset *fts5CloneColset(int *pRc, Fts5Colset *pOrig){
+  Fts5Colset *pCopy = 0;          /* Clone to return, or NULL */
+  if( pOrig!=0 ){
+    /* The (nCol-1) term sizes the aiCol[] entries beyond the first. */
+    int nCopy = sizeof(Fts5Colset) + (pOrig->nCol-1) * sizeof(int);
+    pCopy = (Fts5Colset*)sqlite3Fts5MallocZero(pRc, nCopy);
+    if( pCopy!=0 ){
+      /* Allocation succeeded: copy all nCopy bytes of the original. */
+      memcpy(pCopy, pOrig, nCopy);
+    }
+  }
+  return pCopy;
+}
+
+/*
+** Remove from colset pColset any columns that are not also in colset pMerge.
+** Both aiCol[] arrays are walked in step; this assumes both are sorted.
+*/
+static void fts5MergeColset(Fts5Colset *pColset, Fts5Colset *pMerge){
+  int iIn = 0;          /* Next input in pColset */
+  int iMerge = 0;       /* Next input in pMerge */
+  int iOut = 0;         /* Next output slot in pColset */
+  while( iIn<pColset->nCol && iMerge<pMerge->nCol ){
+    int iDiff = pColset->aiCol[iIn] - pMerge->aiCol[iMerge];
+    if( iDiff==0 ){
+      pColset->aiCol[iOut++] = pMerge->aiCol[iMerge];
+      iMerge++;
+      iIn++;
+    }else if( iDiff>0 ){
+      iMerge++;
+    }else{
+      iIn++;
+    }
+  }
+  pColset->nCol = iOut;
+}
+
+/*
+** Recursively apply colset pColset to expression node pNode and all of
+** its descendants. If (*ppFree) is not NULL, it contains a spare copy
+** of pColset. This function may use the spare copy and set (*ppFree) to
+** zero, or it may create copies of pColset using fts5CloneColset().
+*/
+static void fts5ParseSetColset(
+  Fts5Parse *pParse,
+  Fts5ExprNode *pNode,
+  Fts5Colset *pColset,
+  Fts5Colset **ppFree
+){
+  if( pParse->rc==SQLITE_OK ){
+    assert( pNode->eType==FTS5_TERM || pNode->eType==FTS5_STRING
+         || pNode->eType==FTS5_AND || pNode->eType==FTS5_OR
+         || pNode->eType==FTS5_NOT || pNode->eType==FTS5_EOF
+    );
+    if( pNode->eType==FTS5_STRING || pNode->eType==FTS5_TERM ){
+      Fts5ExprNearset *pNear = pNode->pNear;
+      if( pNear->pColset ){          /* Existing filter: intersect with it */
+        fts5MergeColset(pNear->pColset, pColset);
+        if( pNear->pColset->nCol==0 ){
+          pNode->eType = FTS5_EOF;   /* Intersection is empty */
+          pNode->xNext = 0;
+        }
+      }else if( *ppFree ){           /* Spare copy unused: hand it over */
+        pNear->pColset = pColset;
+        *ppFree = 0;
+      }else{
+        pNear->pColset = fts5CloneColset(&pParse->rc, pColset);
+      }
+    }else{
+      int i;
+      assert( pNode->eType!=FTS5_EOF || pNode->nChild==0 );
+      for(i=0; i<pNode->nChild; i++){
+        fts5ParseSetColset(pParse, pNode->apChild[i], pColset, ppFree);
+      }
+    }
+  }
+}
+
+/*
+** Apply colset pColset to expression node pExpr and all of its descendents.
+*/
void sqlite3Fts5ParseSetColset(
Fts5Parse *pParse,
- Fts5ExprNearset *pNear,
+ Fts5ExprNode *pExpr,
Fts5Colset *pColset
){
+ Fts5Colset *pFree = pColset; /* Spare copy; freed below unless consumed */
if( pParse->pConfig->eDetail==FTS5_DETAIL_NONE ){
pParse->rc = SQLITE_ERROR;
pParse->zErr = sqlite3_mprintf(
"fts5: column queries are not supported (detail=none)"
);
- sqlite3_free(pColset);
- return;
- }
-
- if( pNear ){
- pNear->pColset = pColset;
}else{
- sqlite3_free(pColset);
+ fts5ParseSetColset(pParse, pExpr, pColset, &pFree);
}
+ sqlite3_free(pFree); /* pFree is NULL if the call above used the spare */
}
static void fts5ExprAssignXNext(Fts5ExprNode *pNode){
@@ -2331,7 +2456,7 @@ static void fts5ExprFunction(
rc = sqlite3Fts5ConfigParse(pGlobal, db, nConfig, azConfig, &pConfig, &zErr);
if( rc==SQLITE_OK ){
- rc = sqlite3Fts5ExprNew(pConfig, zExpr, &pExpr, &zErr);
+ rc = sqlite3Fts5ExprNew(pConfig, pConfig->nCol, zExpr, &pExpr, &zErr);
}
if( rc==SQLITE_OK ){
char *zText;
diff --git a/ext/fts5/fts5_hash.c b/ext/fts5/fts5_hash.c
index afa2a30..1757061 100644
--- a/ext/fts5/fts5_hash.c
+++ b/ext/fts5/fts5_hash.c
@@ -36,9 +36,10 @@ struct Fts5Hash {
/*
** Each entry in the hash table is represented by an object of the
-** following type. Each object, its key (zKey[]) and its current data
-** are stored in a single memory allocation. The position list data
-** immediately follows the key data in memory.
+** following type. Each object, its key (a nul-terminated string) and
+** its current data are stored in a single memory allocation. The
+** key immediately follows the object in memory. The position list
+** data immediately follows the key data in memory.
**
** The data that follows the key is in a similar, but not identical format
** to the doclist data stored in the database. It is:
@@ -62,20 +63,20 @@ struct Fts5HashEntry {
int nAlloc; /* Total size of allocation */
int iSzPoslist; /* Offset of space for 4-byte poslist size */
int nData; /* Total bytes of data (incl. structure) */
- int nKey; /* Length of zKey[] in bytes */
+ int nKey; /* Length of key in bytes */
u8 bDel; /* Set delete-flag @ iSzPoslist */
u8 bContent; /* Set content-flag (detail=none mode) */
i16 iCol; /* Column of last value written */
int iPos; /* Position of last value written */
i64 iRowid; /* Rowid of last value written */
- char zKey[8]; /* Nul-terminated entry key */
};
/*
-** Size of Fts5HashEntry without the zKey[] array.
+** Equivalent to:
+**
+** char *fts5EntryKey(Fts5HashEntry *pEntry){ return zKey; }
*/
-#define FTS5_HASHENTRYSIZE (sizeof(Fts5HashEntry)-8)
-
+#define fts5EntryKey(p) ( ((char *)(&(p)[1])) )
/*
@@ -170,10 +171,11 @@ static int fts5HashResize(Fts5Hash *pHash){
for(i=0; inSlot; i++){
while( apOld[i] ){
- int iHash;
+ unsigned int iHash;
Fts5HashEntry *p = apOld[i];
apOld[i] = p->pHashNext;
- iHash = fts5HashKey(nNew, (u8*)p->zKey, (int)strlen(p->zKey));
+ iHash = fts5HashKey(nNew, (u8*)fts5EntryKey(p),
+ (int)strlen(fts5EntryKey(p)));
p->pHashNext = apNew[iHash];
apNew[iHash] = p;
}
@@ -244,9 +246,10 @@ int sqlite3Fts5HashWrite(
/* Attempt to locate an existing hash entry */
iHash = fts5HashKey2(pHash->nSlot, (u8)bByte, (const u8*)pToken, nToken);
for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
- if( p->zKey[0]==bByte
+ char *zKey = fts5EntryKey(p);
+ if( zKey[0]==bByte
&& p->nKey==nToken
- && memcmp(&p->zKey[1], pToken, nToken)==0
+ && memcmp(&zKey[1], pToken, nToken)==0
){
break;
}
@@ -255,7 +258,8 @@ int sqlite3Fts5HashWrite(
/* If an existing hash entry cannot be found, create a new one. */
if( p==0 ){
/* Figure out how much space to allocate */
- int nByte = FTS5_HASHENTRYSIZE + (nToken+1) + 1 + 64;
+ char *zKey;
+ int nByte = sizeof(Fts5HashEntry) + (nToken+1) + 1 + 64;
if( nByte<128 ) nByte = 128;
/* Grow the Fts5Hash.aSlot[] array if necessary. */
@@ -268,14 +272,15 @@ int sqlite3Fts5HashWrite(
/* Allocate new Fts5HashEntry and add it to the hash table. */
p = (Fts5HashEntry*)sqlite3_malloc(nByte);
if( !p ) return SQLITE_NOMEM;
- memset(p, 0, FTS5_HASHENTRYSIZE);
+ memset(p, 0, sizeof(Fts5HashEntry));
p->nAlloc = nByte;
- p->zKey[0] = bByte;
- memcpy(&p->zKey[1], pToken, nToken);
- assert( iHash==fts5HashKey(pHash->nSlot, (u8*)p->zKey, nToken+1) );
+ zKey = fts5EntryKey(p);
+ zKey[0] = bByte;
+ memcpy(&zKey[1], pToken, nToken);
+ assert( iHash==fts5HashKey(pHash->nSlot, (u8*)zKey, nToken+1) );
p->nKey = nToken;
- p->zKey[nToken+1] = '\0';
- p->nData = nToken+1 + 1 + FTS5_HASHENTRYSIZE;
+ zKey[nToken+1] = '\0';
+ p->nData = nToken+1 + 1 + sizeof(Fts5HashEntry);
p->pHashNext = pHash->aSlot[iHash];
pHash->aSlot[iHash] = p;
pHash->nEntry++;
@@ -393,9 +398,11 @@ static Fts5HashEntry *fts5HashEntryMerge(
p1 = 0;
}else{
int i = 0;
- while( p1->zKey[i]==p2->zKey[i] ) i++;
+ char *zKey1 = fts5EntryKey(p1);
+ char *zKey2 = fts5EntryKey(p2);
+ while( zKey1[i]==zKey2[i] ) i++;
- if( ((u8)p1->zKey[i])>((u8)p2->zKey[i]) ){
+ if( ((u8)zKey1[i])>((u8)zKey2[i]) ){
/* p2 is smaller */
*ppOut = p2;
ppOut = &p2->pScanNext;
@@ -438,7 +445,7 @@ static int fts5HashEntrySort(
for(iSlot=0; iSlotnSlot; iSlot++){
Fts5HashEntry *pIter;
for(pIter=pHash->aSlot[iSlot]; pIter; pIter=pIter->pHashNext){
- if( pTerm==0 || 0==memcmp(pIter->zKey, pTerm, nTerm) ){
+ if( pTerm==0 || 0==memcmp(fts5EntryKey(pIter), pTerm, nTerm) ){
Fts5HashEntry *pEntry = pIter;
pEntry->pScanNext = 0;
for(i=0; ap[i]; i++){
@@ -471,16 +478,18 @@ int sqlite3Fts5HashQuery(
int *pnDoclist /* OUT: Size of doclist in bytes */
){
unsigned int iHash = fts5HashKey(pHash->nSlot, (const u8*)pTerm, nTerm);
+ char *zKey = 0;
Fts5HashEntry *p;
for(p=pHash->aSlot[iHash]; p; p=p->pHashNext){
- if( memcmp(p->zKey, pTerm, nTerm)==0 && p->zKey[nTerm]==0 ) break;
+ zKey = fts5EntryKey(p);
+ if( memcmp(zKey, pTerm, nTerm)==0 && zKey[nTerm]==0 ) break;
}
if( p ){
fts5HashAddPoslistSize(pHash, p);
- *ppDoclist = (const u8*)&p->zKey[nTerm+1];
- *pnDoclist = p->nData - (FTS5_HASHENTRYSIZE + nTerm + 1);
+ *ppDoclist = (const u8*)&zKey[nTerm+1];
+ *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1);
}else{
*ppDoclist = 0;
*pnDoclist = 0;
@@ -513,11 +522,12 @@ void sqlite3Fts5HashScanEntry(
){
Fts5HashEntry *p;
if( (p = pHash->pScan) ){
- int nTerm = (int)strlen(p->zKey);
+ char *zKey = fts5EntryKey(p);
+ int nTerm = (int)strlen(zKey);
fts5HashAddPoslistSize(pHash, p);
- *pzTerm = p->zKey;
- *ppDoclist = (const u8*)&p->zKey[nTerm+1];
- *pnDoclist = p->nData - (FTS5_HASHENTRYSIZE + nTerm + 1);
+ *pzTerm = zKey;
+ *ppDoclist = (const u8*)&zKey[nTerm+1];
+ *pnDoclist = p->nData - (sizeof(Fts5HashEntry) + nTerm + 1);
}else{
*pzTerm = 0;
*ppDoclist = 0;
diff --git a/ext/fts5/fts5_index.c b/ext/fts5/fts5_index.c
index 46517e1..c941228 100644
--- a/ext/fts5/fts5_index.c
+++ b/ext/fts5/fts5_index.c
@@ -628,7 +628,6 @@ static void fts5CloseReader(Fts5Index *p){
}
}
-
/*
** Retrieve a record from the %_data table.
**
@@ -729,7 +728,8 @@ static int fts5IndexPrepareStmt(
){
if( p->rc==SQLITE_OK ){
if( zSql ){
- p->rc = sqlite3_prepare_v2(p->pConfig->db, zSql, -1, ppStmt, 0);
+ p->rc = sqlite3_prepare_v3(p->pConfig->db, zSql, -1,
+ SQLITE_PREPARE_PERSISTENT, ppStmt, 0);
}else{
p->rc = SQLITE_NOMEM;
}
@@ -778,7 +778,8 @@ static void fts5DataDelete(Fts5Index *p, i64 iFirst, i64 iLast){
if( zSql==0 ){
rc = SQLITE_NOMEM;
}else{
- rc = sqlite3_prepare_v2(pConfig->db, zSql, -1, &p->pDeleter, 0);
+ rc = sqlite3_prepare_v3(pConfig->db, zSql, -1,
+ SQLITE_PREPARE_PERSISTENT, &p->pDeleter, 0);
sqlite3_free(zSql);
}
if( rc!=SQLITE_OK ){
@@ -2039,7 +2040,7 @@ static void fts5SegIterNext(
else if( pLeaf->nn>pLeaf->szLeaf ){
pIter->iPgidxOff = pLeaf->szLeaf + fts5GetVarint32(
&pLeaf->p[pLeaf->szLeaf], iOff
- );
+ );
pIter->iLeafOffset = iOff;
pIter->iEndofDoclist = iOff;
bNewTerm = 1;
@@ -2073,6 +2074,7 @@ static void fts5SegIterNext(
*/
int nSz;
assert( p->rc==SQLITE_OK );
+ assert( pIter->iLeafOffset<=pIter->pLeaf->nn );
fts5FastGetVarint32(pIter->pLeaf->p, pIter->iLeafOffset, nSz);
pIter->bDel = (nSz & 0x0001);
pIter->nPos = nSz>>1;
@@ -2878,7 +2880,8 @@ static void fts5MultiIterNext2(
){
assert( pIter->bSkipEmpty );
if( p->rc==SQLITE_OK ){
- do {
+ *pbNewTerm = 0;
+ do{
int iFirst = pIter->aFirst[1].iFirst;
Fts5SegIter *pSeg = &pIter->aSeg[iFirst];
int bNewTerm = 0;
@@ -2891,8 +2894,6 @@ static void fts5MultiIterNext2(
fts5MultiIterAdvanced(p, pIter, iFirst, 1);
fts5MultiIterSetEof(pIter);
*pbNewTerm = 1;
- }else{
- *pbNewTerm = 0;
}
fts5AssertMultiIterSetup(p, pIter);
@@ -3067,7 +3068,7 @@ static void fts5ChunkIterate(
break;
}else{
pgno++;
- pData = fts5DataRead(p, FTS5_SEGMENT_ROWID(pSeg->pSeg->iSegid, pgno));
+ pData = fts5LeafRead(p, FTS5_SEGMENT_ROWID(pSeg->pSeg->iSegid, pgno));
if( pData==0 ) break;
pChunk = &pData->p[4];
nChunk = MIN(nRem, pData->szLeaf - 4);
@@ -3158,23 +3159,23 @@ static int fts5IndexExtractCol(
return p - (*pa);
}
-static int fts5IndexExtractColset (
+static void fts5IndexExtractColset(
+ int *pRc, /* IN/OUT: Error code */
Fts5Colset *pColset, /* Colset to filter on */
const u8 *pPos, int nPos, /* Position list */
Fts5Buffer *pBuf /* Output buffer */
){
- int rc = SQLITE_OK;
- int i;
-
- fts5BufferZero(pBuf);
- for(i=0; i<pColset->nCol; i++){
- const u8 *pSub = pPos;
- int nSub = fts5IndexExtractCol(&pSub, nPos, pColset->aiCol[i]);
- if( nSub ){
- fts5BufferAppendBlob(&rc, pBuf, nSub, pSub);
+ if( *pRc==SQLITE_OK ){ /* No-op if an error has already occurred */
+ int i;
+ fts5BufferZero(pBuf);
+ for(i=0; i<pColset->nCol; i++){
+ const u8 *pSub = pPos;
+ int nSub = fts5IndexExtractCol(&pSub, nPos, pColset->aiCol[i]);
+ if( nSub ){
+ fts5BufferAppendBlob(pRc, pBuf, nSub, pSub);
+ }
}
}
- return rc;
}
/*
@@ -3298,8 +3299,9 @@ static void fts5IterSetOutputs_Full(Fts5Iter *pIter, Fts5SegIter *pSeg){
pIter->base.nData = fts5IndexExtractCol(&a, pSeg->nPos,pColset->aiCol[0]);
pIter->base.pData = a;
}else{
+ int *pRc = &pIter->pIndex->rc;
fts5BufferZero(&pIter->poslist);
- fts5IndexExtractColset(pColset, a, pSeg->nPos, &pIter->poslist);
+ fts5IndexExtractColset(pRc, pColset, a, pSeg->nPos, &pIter->poslist);
pIter->base.pData = pIter->poslist.p;
pIter->base.nData = pIter->poslist.n;
}
@@ -3844,9 +3846,6 @@ static void fts5WriteFlushLeaf(Fts5Index *p, Fts5SegWriter *pWriter){
Fts5PageWriter *pPage = &pWriter->writer;
i64 iRowid;
-static int nCall = 0;
-nCall++;
-
assert( (pPage->pgidx.n==0)==(pWriter->bFirstTermInPage) );
/* Set the szLeaf header field. */
@@ -4195,6 +4194,7 @@ static void fts5IndexMergeLevel(
int bOldest; /* True if the output segment is the oldest */
int eDetail = p->pConfig->eDetail;
const int flags = FTS5INDEX_QUERY_NOOUTPUT;
+ int bTermWritten = 0; /* True if current term already output */
assert( iLvlnLevel );
assert( pLvl->nMerge<=pLvl->nSeg );
@@ -4248,18 +4248,22 @@ static void fts5IndexMergeLevel(
int nTerm;
const u8 *pTerm;
- /* Check for key annihilation. */
- if( pSegIter->nPos==0 && (bOldest || pSegIter->bDel==0) ) continue;
-
pTerm = fts5MultiIterTerm(pIter, &nTerm);
if( nTerm!=term.n || memcmp(pTerm, term.p, nTerm) ){
if( pnRem && writer.nLeafWritten>nRem ){
break;
}
+ fts5BufferSet(&p->rc, &term, nTerm, pTerm);
+ bTermWritten =0;
+ }
+ /* Check for key annihilation. */
+ if( pSegIter->nPos==0 && (bOldest || pSegIter->bDel==0) ) continue;
+
+ if( p->rc==SQLITE_OK && bTermWritten==0 ){
/* This is a new term. Append a term to the output segment. */
fts5WriteAppendTerm(p, &writer, nTerm, pTerm);
- fts5BufferSet(&p->rc, &term, nTerm, pTerm);
+ bTermWritten = 1;
}
/* Append the rowid to the output */
@@ -5091,7 +5095,7 @@ static void fts5SetupPrefixIter(
if( pData ){
pData->p = (u8*)&pData[1];
pData->nn = pData->szLeaf = doclist.n;
- memcpy(pData->p, doclist.p, doclist.n);
+ if( doclist.n ) memcpy(pData->p, doclist.p, doclist.n);
fts5MultiIterNew2(p, pData, bDesc, ppIter);
}
fts5BufferFree(&doclist);
@@ -5130,10 +5134,10 @@ int sqlite3Fts5IndexBeginWrite(Fts5Index *p, int bDelete, i64 iRowid){
/*
** Commit data to disk.
*/
-int sqlite3Fts5IndexSync(Fts5Index *p, int bCommit){
+int sqlite3Fts5IndexSync(Fts5Index *p){
assert( p->rc==SQLITE_OK );
fts5IndexFlush(p);
- if( bCommit ) fts5CloseReader(p);
+ fts5CloseReader(p);
return fts5IndexReturn(p);
}
@@ -5330,7 +5334,7 @@ int sqlite3Fts5IndexQuery(
if( sqlite3Fts5BufferSize(&p->rc, &buf, nToken+1)==0 ){
int iIdx = 0; /* Index to search */
- memcpy(&buf.p[1], pToken, nToken);
+ if( nToken ) memcpy(&buf.p[1], pToken, nToken);
/* Figure out which index to search and set iIdx accordingly. If this
** is a prefix query for which there is no prefix index, set iIdx to
@@ -5379,7 +5383,7 @@ int sqlite3Fts5IndexQuery(
}
if( p->rc ){
- sqlite3Fts5IterClose(&pRet->base);
+ sqlite3Fts5IterClose((Fts5IndexIter*)pRet);
pRet = 0;
fts5CloseReader(p);
}
@@ -5829,7 +5833,7 @@ static void fts5IndexIntegrityCheckSegment(
** ignore this b-tree entry. Otherwise, load it into memory. */
if( iIdxLeafpgnoFirst ) continue;
iRow = FTS5_SEGMENT_ROWID(pSeg->iSegid, iIdxLeaf);
- pLeaf = fts5DataRead(p, iRow);
+ pLeaf = fts5LeafRead(p, iRow);
if( pLeaf==0 ) break;
/* Check that the leaf contains at least one term, and that it is equal
diff --git a/ext/fts5/fts5_main.c b/ext/fts5/fts5_main.c
index 384d3dd..d59cd5b 100644
--- a/ext/fts5/fts5_main.c
+++ b/ext/fts5/fts5_main.c
@@ -506,6 +506,7 @@ static void fts5SetUniqueFlag(sqlite3_index_info *pIdxInfo){
static int fts5BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
Fts5Table *pTab = (Fts5Table*)pVTab;
Fts5Config *pConfig = pTab->pConfig;
+ const int nCol = pConfig->nCol;
int idxFlags = 0; /* Parameter passed through to xFilter() */
int bHasMatch;
int iNext;
@@ -531,24 +532,34 @@ static int fts5BestIndexMethod(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
int aColMap[3];
aColMap[0] = -1;
- aColMap[1] = pConfig->nCol;
- aColMap[2] = pConfig->nCol+1;
+ aColMap[1] = nCol;
+ aColMap[2] = nCol+1;
/* Set idxFlags flags for all WHERE clause terms that will be used. */
for(i=0; inConstraint; i++){
struct sqlite3_index_constraint *p = &pInfo->aConstraint[i];
- int j;
- for(j=0; jiColumn==aColMap[pC->iCol] && p->op & pC->op ){
- if( p->usable ){
+ int iCol = p->iColumn;
+
+ if( (p->op==SQLITE_INDEX_CONSTRAINT_MATCH && iCol>=0 && iCol<=nCol)
+ || (p->op==SQLITE_INDEX_CONSTRAINT_EQ && iCol==nCol)
+ ){
+ /* A MATCH operator or equivalent */
+ if( p->usable ){
+ idxFlags = (idxFlags & 0xFFFF) | FTS5_BI_MATCH | (iCol << 16);
+ aConstraint[0].iConsIndex = i;
+ }else{
+ /* As there exists an unusable MATCH constraint this is an
+ ** unusable plan. Set a prohibitively high cost. */
+ pInfo->estimatedCost = 1e50;
+ return SQLITE_OK;
+ }
+ }else{
+ int j;
+ for(j=1; jiCol] && p->op & pC->op && p->usable ){
pC->iConsIndex = i;
idxFlags |= pC->fts5op;
- }else if( j==0 ){
- /* As there exists an unusable MATCH constraint this is an
- ** unusable plan. Set a prohibitively high cost. */
- pInfo->estimatedCost = 1e50;
- return SQLITE_OK;
}
}
}
@@ -872,7 +883,8 @@ static int fts5PrepareStatement(
if( zSql==0 ){
rc = SQLITE_NOMEM;
}else{
- rc = sqlite3_prepare_v2(pConfig->db, zSql, -1, &pRet, 0);
+ rc = sqlite3_prepare_v3(pConfig->db, zSql, -1,
+ SQLITE_PREPARE_PERSISTENT, &pRet, 0);
if( rc!=SQLITE_OK ){
*pConfig->pzErrmsg = sqlite3_mprintf("%s", sqlite3_errmsg(pConfig->db));
}
@@ -1008,7 +1020,8 @@ static int fts5FindRankFunction(Fts5Cursor *pCsr){
char *zSql = sqlite3Fts5Mprintf(&rc, "SELECT %s", zRankArgs);
if( zSql ){
sqlite3_stmt *pStmt = 0;
- rc = sqlite3_prepare_v2(pConfig->db, zSql, -1, &pStmt, 0);
+ rc = sqlite3_prepare_v3(pConfig->db, zSql, -1,
+ SQLITE_PREPARE_PERSISTENT, &pStmt, 0);
sqlite3_free(zSql);
assert( rc==SQLITE_OK || pCsr->pRankArgStmt==0 );
if( rc==SQLITE_OK ){
@@ -1123,6 +1136,7 @@ static int fts5FilterMethod(
sqlite3_value *pRowidEq = 0; /* rowid = ? expression (or NULL) */
sqlite3_value *pRowidLe = 0; /* rowid <= ? expression (or NULL) */
sqlite3_value *pRowidGe = 0; /* rowid >= ? expression (or NULL) */
+ int iCol; /* Column on LHS of MATCH operator */
char **pzErrmsg = pConfig->pzErrmsg;
UNUSED_PARAM(zUnused);
@@ -1153,6 +1167,8 @@ static int fts5FilterMethod(
if( BitFlagTest(idxNum, FTS5_BI_ROWID_EQ) ) pRowidEq = apVal[iVal++];
if( BitFlagTest(idxNum, FTS5_BI_ROWID_LE) ) pRowidLe = apVal[iVal++];
if( BitFlagTest(idxNum, FTS5_BI_ROWID_GE) ) pRowidGe = apVal[iVal++];
+ iCol = (idxNum>>16);
+ assert( iCol>=0 && iCol<=pConfig->nCol );
assert( iVal==nVal );
bOrderByRank = ((idxNum & FTS5_BI_ORDER_RANK) ? 1 : 0);
pCsr->bDesc = bDesc = ((idxNum & FTS5_BI_ORDER_DESC) ? 1 : 0);
@@ -1199,7 +1215,7 @@ static int fts5FilterMethod(
rc = fts5SpecialMatch(pTab, pCsr, &zExpr[1]);
}else{
char **pzErr = &pTab->base.zErrMsg;
- rc = sqlite3Fts5ExprNew(pConfig, zExpr, &pCsr->pExpr, pzErr);
+ rc = sqlite3Fts5ExprNew(pConfig, iCol, zExpr, &pCsr->pExpr, pzErr);
if( rc==SQLITE_OK ){
if( bOrderByRank ){
pCsr->ePlan = FTS5_PLAN_SORTED_MATCH;
@@ -1579,7 +1595,7 @@ static int fts5SyncMethod(sqlite3_vtab *pVtab){
fts5CheckTransactionState(pTab, FTS5_SYNC, 0);
pTab->pConfig->pzErrmsg = &pTab->base.zErrMsg;
fts5TripCursors(pTab);
- rc = sqlite3Fts5StorageSync(pTab->pStorage, 1);
+ rc = sqlite3Fts5StorageSync(pTab->pStorage);
pTab->pConfig->pzErrmsg = 0;
return rc;
}
@@ -2390,7 +2406,7 @@ static int fts5SavepointMethod(sqlite3_vtab *pVtab, int iSavepoint){
UNUSED_PARAM(iSavepoint); /* Call below is a no-op for NDEBUG builds */
fts5CheckTransactionState(pTab, FTS5_SAVEPOINT, iSavepoint);
fts5TripCursors(pTab);
- return sqlite3Fts5StorageSync(pTab->pStorage, 0);
+ return sqlite3Fts5StorageSync(pTab->pStorage);
}
/*
@@ -2403,7 +2419,7 @@ static int fts5ReleaseMethod(sqlite3_vtab *pVtab, int iSavepoint){
UNUSED_PARAM(iSavepoint); /* Call below is a no-op for NDEBUG builds */
fts5CheckTransactionState(pTab, FTS5_RELEASE, iSavepoint);
fts5TripCursors(pTab);
- return sqlite3Fts5StorageSync(pTab->pStorage, 0);
+ return sqlite3Fts5StorageSync(pTab->pStorage);
}
/*
@@ -2593,15 +2609,14 @@ static void fts5ModuleDestroy(void *pCtx){
static void fts5Fts5Func(
sqlite3_context *pCtx, /* Function call context */
int nArg, /* Number of args */
- sqlite3_value **apUnused /* Function arguments */
+ sqlite3_value **apArg /* Function arguments */
){
Fts5Global *pGlobal = (Fts5Global*)sqlite3_user_data(pCtx);
- char buf[8];
- UNUSED_PARAM2(nArg, apUnused);
- assert( nArg==0 );
- assert( sizeof(buf)>=sizeof(pGlobal) );
- memcpy(buf, (void*)&pGlobal, sizeof(pGlobal));
- sqlite3_result_blob(pCtx, buf, sizeof(pGlobal), SQLITE_TRANSIENT);
+ fts5_api **ppApi;
+ UNUSED_PARAM(nArg);
+ assert( nArg==1 );
+ ppApi = (fts5_api**)sqlite3_value_pointer(apArg[0], "fts5_api_ptr");
+ if( ppApi ) *ppApi = &pGlobal->api;
}
/*
@@ -2666,7 +2681,7 @@ static int fts5Init(sqlite3 *db){
if( rc==SQLITE_OK ) rc = sqlite3Fts5VocabInit(pGlobal, db);
if( rc==SQLITE_OK ){
rc = sqlite3_create_function(
- db, "fts5", 0, SQLITE_UTF8, p, fts5Fts5Func, 0, 0
+ db, "fts5", 1, SQLITE_UTF8, p, fts5Fts5Func, 0, 0
);
}
if( rc==SQLITE_OK ){
diff --git a/ext/fts5/fts5_storage.c b/ext/fts5/fts5_storage.c
index a695887..59336fc 100644
--- a/ext/fts5/fts5_storage.c
+++ b/ext/fts5/fts5_storage.c
@@ -136,7 +136,8 @@ static int fts5StorageGetStmt(
if( zSql==0 ){
rc = SQLITE_NOMEM;
}else{
- rc = sqlite3_prepare_v2(pC->db, zSql, -1, &p->aStmt[eStmt], 0);
+ rc = sqlite3_prepare_v3(pC->db, zSql, -1,
+ SQLITE_PREPARE_PERSISTENT, &p->aStmt[eStmt], 0);
sqlite3_free(zSql);
if( rc!=SQLITE_OK && pzErrMsg ){
*pzErrMsg = sqlite3_mprintf("%s", sqlite3_errmsg(pC->db));
@@ -218,7 +219,7 @@ static void fts5StorageRenameOne(
int sqlite3Fts5StorageRename(Fts5Storage *pStorage, const char *zName){
Fts5Config *pConfig = pStorage->pConfig;
- int rc = sqlite3Fts5StorageSync(pStorage, 1);
+ int rc = sqlite3Fts5StorageSync(pStorage);
fts5StorageRenameOne(pConfig, &rc, "data", zName);
fts5StorageRenameOne(pConfig, &rc, "idx", zName);
@@ -545,11 +546,6 @@ int sqlite3Fts5StorageDelete(Fts5Storage *p, i64 iDel, sqlite3_value **apVal){
}
}
- /* Write the averages record */
- if( rc==SQLITE_OK ){
- rc = fts5StorageSaveTotals(p);
- }
-
return rc;
}
@@ -753,11 +749,6 @@ int sqlite3Fts5StorageIndexInsert(
}
sqlite3_free(buf.p);
- /* Write the averages record */
- if( rc==SQLITE_OK ){
- rc = fts5StorageSaveTotals(p);
- }
-
return rc;
}
@@ -1091,13 +1082,18 @@ int sqlite3Fts5StorageRowCount(Fts5Storage *p, i64 *pnRow){
/*
** Flush any data currently held in-memory to disk.
*/
-int sqlite3Fts5StorageSync(Fts5Storage *p, int bCommit){
- if( bCommit && p->bTotalsValid ){
- int rc = fts5StorageSaveTotals(p);
+int sqlite3Fts5StorageSync(Fts5Storage *p){
+ int rc = SQLITE_OK;
+ i64 iLastRowid = sqlite3_last_insert_rowid(p->pConfig->db);
+ if( p->bTotalsValid ){ /* Totals record is pending: write it now */
+ rc = fts5StorageSaveTotals(p);
p->bTotalsValid = 0;
- if( rc!=SQLITE_OK ) return rc;
}
- return sqlite3Fts5IndexSync(p->pIndex, bCommit);
+ if( rc==SQLITE_OK ){ /* Skip the index sync if saving totals failed */
+ rc = sqlite3Fts5IndexSync(p->pIndex);
+ }
+ sqlite3_set_last_insert_rowid(p->pConfig->db, iLastRowid); /* restore */
+ return rc;
}
int sqlite3Fts5StorageRollback(Fts5Storage *p){
diff --git a/ext/fts5/fts5_tcl.c b/ext/fts5/fts5_tcl.c
index 5fe690f..e8d4c32 100644
--- a/ext/fts5/fts5_tcl.c
+++ b/ext/fts5/fts5_tcl.c
@@ -99,16 +99,13 @@ static int SQLITE_TCLAPI f5tDbAndApi(
sqlite3_stmt *pStmt = 0;
fts5_api *pApi = 0;
- rc = sqlite3_prepare_v2(db, "SELECT fts5()", -1, &pStmt, 0);
+ rc = sqlite3_prepare_v2(db, "SELECT fts5(?1)", -1, &pStmt, 0);
if( rc!=SQLITE_OK ){
Tcl_AppendResult(interp, "error: ", sqlite3_errmsg(db), 0);
return TCL_ERROR;
}
-
- if( SQLITE_ROW==sqlite3_step(pStmt) ){
- const void *pPtr = sqlite3_column_blob(pStmt, 0);
- memcpy((void*)&pApi, pPtr, sizeof(pApi));
- }
+ sqlite3_bind_pointer(pStmt, 1, (void*)&pApi, "fts5_api_ptr", 0);
+ sqlite3_step(pStmt);
if( sqlite3_finalize(pStmt)!=SQLITE_OK ){
Tcl_AppendResult(interp, "error: ", sqlite3_errmsg(db), 0);
diff --git a/ext/fts5/fts5_test_mi.c b/ext/fts5/fts5_test_mi.c
index a905b85..481d09b 100644
--- a/ext/fts5/fts5_test_mi.c
+++ b/ext/fts5/fts5_test_mi.c
@@ -73,13 +73,10 @@ static int fts5_api_from_db(sqlite3 *db, fts5_api **ppApi){
int rc;
*ppApi = 0;
- rc = sqlite3_prepare(db, "SELECT fts5()", -1, &pStmt, 0);
+ rc = sqlite3_prepare(db, "SELECT fts5(?1)", -1, &pStmt, 0);
if( rc==SQLITE_OK ){
- if( SQLITE_ROW==sqlite3_step(pStmt)
- && sizeof(fts5_api*)==sqlite3_column_bytes(pStmt, 0)
- ){
- memcpy(ppApi, sqlite3_column_blob(pStmt, 0), sizeof(fts5_api*));
- }
+ sqlite3_bind_pointer(pStmt, 1, (void*)ppApi, "fts5_api_ptr", 0);
+ (void)sqlite3_step(pStmt);
rc = sqlite3_finalize(pStmt);
}
@@ -422,4 +419,3 @@ int sqlite3Fts5TestRegisterMatchinfo(sqlite3 *db){
}
#endif /* SQLITE_ENABLE_FTS5 */
-
diff --git a/ext/fts5/fts5_test_tok.c b/ext/fts5/fts5_test_tok.c
index 10af126..1818e16 100644
--- a/ext/fts5/fts5_test_tok.c
+++ b/ext/fts5/fts5_test_tok.c
@@ -40,7 +40,7 @@
*/
#if defined(SQLITE_TEST) && defined(SQLITE_ENABLE_FTS5)
-#include
+#include "fts5.h"
#include
#include
@@ -182,7 +182,7 @@ static int fts5tokConnectMethod(
Fts5tokTable *pTab = 0;
int rc;
char **azDequote = 0;
- int nDequote;
+ int nDequote = 0;
rc = sqlite3_declare_vtab(db,
"CREATE TABLE x(input HIDDEN, token, start, end, position)"
diff --git a/ext/fts5/fts5parse.y b/ext/fts5/fts5parse.y
index 1cc4b88..1582909 100644
--- a/ext/fts5/fts5parse.y
+++ b/ext/fts5/fts5parse.y
@@ -89,32 +89,6 @@ input ::= expr(X). { sqlite3Fts5ParseFinished(pParse, X); }
%destructor expr { sqlite3Fts5ParseNodeFree($$); }
%destructor exprlist { sqlite3Fts5ParseNodeFree($$); }
-expr(A) ::= expr(X) AND expr(Y). {
- A = sqlite3Fts5ParseNode(pParse, FTS5_AND, X, Y, 0);
-}
-expr(A) ::= expr(X) OR expr(Y). {
- A = sqlite3Fts5ParseNode(pParse, FTS5_OR, X, Y, 0);
-}
-expr(A) ::= expr(X) NOT expr(Y). {
- A = sqlite3Fts5ParseNode(pParse, FTS5_NOT, X, Y, 0);
-}
-
-expr(A) ::= LP expr(X) RP. {A = X;}
-expr(A) ::= exprlist(X). {A = X;}
-
-exprlist(A) ::= cnearset(X). {A = X;}
-exprlist(A) ::= exprlist(X) cnearset(Y). {
- A = sqlite3Fts5ParseImplicitAnd(pParse, X, Y);
-}
-
-cnearset(A) ::= nearset(X). {
- A = sqlite3Fts5ParseNode(pParse, FTS5_STRING, 0, 0, X);
-}
-cnearset(A) ::= colset(X) COLON nearset(Y). {
- sqlite3Fts5ParseSetColset(pParse, Y, X);
- A = sqlite3Fts5ParseNode(pParse, FTS5_STRING, 0, 0, Y);
-}
-
%type colset {Fts5Colset*}
%destructor colset { sqlite3_free($$); }
%type colsetlist {Fts5Colset*}
@@ -138,6 +112,37 @@ colsetlist(A) ::= STRING(X). {
A = sqlite3Fts5ParseColset(pParse, 0, &X);
}
+expr(A) ::= expr(X) AND expr(Y). {
+ A = sqlite3Fts5ParseNode(pParse, FTS5_AND, X, Y, 0);
+}
+expr(A) ::= expr(X) OR expr(Y). {
+ A = sqlite3Fts5ParseNode(pParse, FTS5_OR, X, Y, 0);
+}
+expr(A) ::= expr(X) NOT expr(Y). {
+ A = sqlite3Fts5ParseNode(pParse, FTS5_NOT, X, Y, 0);
+}
+
+expr(A) ::= colset(X) COLON LP expr(Y) RP. {
+ sqlite3Fts5ParseSetColset(pParse, Y, X);
+ A = Y;
+}
+expr(A) ::= LP expr(X) RP. {A = X;}
+expr(A) ::= exprlist(X). {A = X;}
+
+exprlist(A) ::= cnearset(X). {A = X;}
+exprlist(A) ::= exprlist(X) cnearset(Y). {
+ A = sqlite3Fts5ParseImplicitAnd(pParse, X, Y);
+}
+
+cnearset(A) ::= nearset(X). {
+ A = sqlite3Fts5ParseNode(pParse, FTS5_STRING, 0, 0, X);
+}
+cnearset(A) ::= colset(X) COLON nearset(Y). {
+ A = sqlite3Fts5ParseNode(pParse, FTS5_STRING, 0, 0, Y);
+ sqlite3Fts5ParseSetColset(pParse, A, X);
+}
+
+
%type nearset {Fts5ExprNearset*}
%type nearphrases {Fts5ExprNearset*}
%destructor nearset { sqlite3Fts5ParseNearsetFree($$); }
diff --git a/ext/fts5/test/fts5aa.test b/ext/fts5/test/fts5aa.test
index 428ca6c..a3ea0af 100644
--- a/ext/fts5/test/fts5aa.test
+++ b/ext/fts5/test/fts5aa.test
@@ -441,7 +441,7 @@ db func funk funk
do_catchsql_test 16.2 {
SELECT funk(), bm25(n1), funk() FROM n1 WHERE n1 MATCH 'a+b+c+d'
} {0 {{} -1e-06 {}}}
-# {1 {SQL logic error or missing database}}
+# {1 {SQL logic error}}
#-------------------------------------------------------------------------
#
@@ -561,9 +561,37 @@ do_test 20.1 {
execsql { SELECT rowid FROM tmp WHERE tmp MATCH 'y' }
} $::ids
+#--------------------------------------------------------------------
+# Test that a DROP TABLE may be executed within a transaction that
+# writes to an FTS5 table.
+#
+do_execsql_test 21.0 {
+ CREATE TEMP TABLE t8(a, b);
+ CREATE VIRTUAL TABLE ft USING fts5(x, detail=%DETAIL%);
+}
+
+do_execsql_test 21.1 {
+ BEGIN;
+ INSERT INTO ft VALUES('a b c');
+ DROP TABLE t8;
+ COMMIT;
+}
+
+do_execsql_test 22.0 {
+ CREATE VIRTUAL TABLE t9 USING fts5(x, detail=%DETAIL%);
+ INSERT INTO t9(rowid, x) VALUES(2, 'bbb');
+ BEGIN;
+ INSERT INTO t9(rowid, x) VALUES(1, 'aaa');
+ DELETE FROM t9 WHERE rowid = 2;
+ INSERT INTO t9(rowid, x) VALUES(3, 'bbb');
+ COMMIT;
+}
+
+do_execsql_test 22.1 {
+ SELECT rowid FROM t9('a*')
+} {1}
+
}
finish_test
-
-
diff --git a/ext/fts5/test/fts5ab.test b/ext/fts5/test/fts5ab.test
index 95da2cd..3979dd4 100644
--- a/ext/fts5/test/fts5ab.test
+++ b/ext/fts5/test/fts5ab.test
@@ -294,4 +294,3 @@ do_execsql_test 7.0 {
finish_test
-
diff --git a/ext/fts5/test/fts5ac.test b/ext/fts5/test/fts5ac.test
index 61b3230..f3a9146 100644
--- a/ext/fts5/test/fts5ac.test
+++ b/ext/fts5/test/fts5ac.test
@@ -276,4 +276,3 @@ foreach {tn expr tclexpr} {
}
finish_test
-
diff --git a/ext/fts5/test/fts5ad.test b/ext/fts5/test/fts5ad.test
index 974aa78..524da6d 100644
--- a/ext/fts5/test/fts5ad.test
+++ b/ext/fts5/test/fts5ad.test
@@ -231,7 +231,6 @@ foreach {T create} {
set res [lsort -integer -increasing $res]
}
set n [llength $res]
- if {$T==5} breakpoint
do_execsql_test $T.$bAsc.$tn.$n $sql $res
}
}
@@ -242,4 +241,3 @@ foreach {T create} {
}
finish_test
-
diff --git a/ext/fts5/test/fts5ae.test b/ext/fts5/test/fts5ae.test
index 5153306..d9f132c 100644
--- a/ext/fts5/test/fts5ae.test
+++ b/ext/fts5/test/fts5ae.test
@@ -309,4 +309,3 @@ foreach {tn q cnt} {
}
finish_test
-
diff --git a/ext/fts5/test/fts5af.test b/ext/fts5/test/fts5af.test
index 6aab55a..fa4ebd2 100644
--- a/ext/fts5/test/fts5af.test
+++ b/ext/fts5/test/fts5af.test
@@ -178,4 +178,3 @@ do_execsql_test 5.1 {
} ;# foreach_detail_mode
finish_test
-
diff --git a/ext/fts5/test/fts5ag.test b/ext/fts5/test/fts5ag.test
index de126a2..9ead957 100644
--- a/ext/fts5/test/fts5ag.test
+++ b/ext/fts5/test/fts5ag.test
@@ -142,4 +142,3 @@ if {[detail_is_full]} {
finish_test
-
diff --git a/ext/fts5/test/fts5ah.test b/ext/fts5/test/fts5ah.test
index b7beb56..24613f5 100644
--- a/ext/fts5/test/fts5ah.test
+++ b/ext/fts5/test/fts5ah.test
@@ -167,4 +167,3 @@ do_execsql_test 1.8.2 {
#db eval {SELECT rowid, fts5_decode(rowid, block) aS r FROM t1_data} {puts $r}
finish_test
-
diff --git a/ext/fts5/test/fts5ai.test b/ext/fts5/test/fts5ai.test
index e32c806..24e780a 100644
--- a/ext/fts5/test/fts5ai.test
+++ b/ext/fts5/test/fts5ai.test
@@ -55,4 +55,3 @@ do_execsql_test 1.2 {
finish_test
-
diff --git a/ext/fts5/test/fts5aj.test b/ext/fts5/test/fts5aj.test
index 6b9dddd..50dae20 100644
--- a/ext/fts5/test/fts5aj.test
+++ b/ext/fts5/test/fts5aj.test
@@ -66,4 +66,3 @@ do_execsql_test 2.0 { INSERT INTO t1(t1) VALUES('integrity-check') }
finish_test
-
diff --git a/ext/fts5/test/fts5ak.test b/ext/fts5/test/fts5ak.test
index 0f699a6..cab0ae2 100644
--- a/ext/fts5/test/fts5ak.test
+++ b/ext/fts5/test/fts5ak.test
@@ -147,4 +147,3 @@ do_execsql_test 3.1 {
}
finish_test
-
diff --git a/ext/fts5/test/fts5al.test b/ext/fts5/test/fts5al.test
index c0dd211..842d991 100644
--- a/ext/fts5/test/fts5al.test
+++ b/ext/fts5/test/fts5al.test
@@ -77,7 +77,7 @@ foreach {tn defn} {
} {
do_test 2.2.$tn {
catchsql { INSERT INTO ft1(ft1, rank) VALUES('rank', $defn) }
- } {1 {SQL logic error or missing database}}
+ } {1 {SQL logic error}}
}
#-------------------------------------------------------------------------
@@ -297,4 +297,3 @@ do_catchsql_test 4.4.4 {
finish_test
-
diff --git a/ext/fts5/test/fts5alter.test b/ext/fts5/test/fts5alter.test
index eae01b7..67f948c 100644
--- a/ext/fts5/test/fts5alter.test
+++ b/ext/fts5/test/fts5alter.test
@@ -89,7 +89,6 @@ do_execsql_test 3.1 {
BEGIN;
INSERT INTO abc(rowid, a) VALUES(2, 'a');
}
-breakpoint
do_execsql_test 3.2 {
SELECT rowid FROM abc WHERE abc MATCH 'a';
} {1 2}
@@ -100,4 +99,3 @@ do_execsql_test 3.3 {
} {1 2}
finish_test
-
diff --git a/ext/fts5/test/fts5auto.test b/ext/fts5/test/fts5auto.test
index 218b3f4..79d432b 100644
--- a/ext/fts5/test/fts5auto.test
+++ b/ext/fts5/test/fts5auto.test
@@ -342,4 +342,3 @@ foreach {tn expr} {
finish_test
-
diff --git a/ext/fts5/test/fts5aux.test b/ext/fts5/test/fts5aux.test
index fa3167a..0216b52 100644
--- a/ext/fts5/test/fts5aux.test
+++ b/ext/fts5/test/fts5aux.test
@@ -240,7 +240,6 @@ foreach {tn lRow res} {
} {
execsql { DELETE FROM x1 }
foreach row $lRow { execsql { INSERT INTO x1 VALUES($row) } }
- breakpoint
do_execsql_test 8.$tn {
SELECT highlight(x1, 0, '[', ']') FROM x1 WHERE x1 MATCH 'a OR (b AND d)';
} $res
@@ -279,4 +278,3 @@ do_execsql_test 9.3 {
finish_test
-
diff --git a/ext/fts5/test/fts5auxdata.test b/ext/fts5/test/fts5auxdata.test
index dbbb1db..a2a4170 100644
--- a/ext/fts5/test/fts5auxdata.test
+++ b/ext/fts5/test/fts5auxdata.test
@@ -112,4 +112,3 @@ db eval {
}
finish_test
-
diff --git a/ext/fts5/test/fts5bigpl.test b/ext/fts5/test/fts5bigpl.test
index 85f7460..2c9df11 100644
--- a/ext/fts5/test/fts5bigpl.test
+++ b/ext/fts5/test/fts5bigpl.test
@@ -61,4 +61,3 @@ do_test 2.1...slow {
} {}
finish_test
-
diff --git a/ext/fts5/test/fts5bigtok.test b/ext/fts5/test/fts5bigtok.test
index 9ccaf6c..f74ec8f 100644
--- a/ext/fts5/test/fts5bigtok.test
+++ b/ext/fts5/test/fts5bigtok.test
@@ -64,5 +64,3 @@ foreach_detail_mode $::testprefix {
}
finish_test
-
-
diff --git a/ext/fts5/test/fts5colset.test b/ext/fts5/test/fts5colset.test
index e333324..74f2300 100644
--- a/ext/fts5/test/fts5colset.test
+++ b/ext/fts5/test/fts5colset.test
@@ -44,16 +44,43 @@ foreach_detail_mode $::testprefix {
9 "-c : a" {1 2 4}
10 "-\"c\" : a" {1 2 4}
} {
- breakpoint
do_execsql_test 1.$tn {
SELECT rowid FROM t1($q)
} $res
}
+ foreach {tn q res} {
+ 0 {{a} : (a AND ":")} {}
+ 1 "{a b c} : (a AND d)" {2 3}
+ 2 "{a b c} : (a AND b:d)" {3}
+ 3 "{a b c} : (a AND d:d)" {}
+ 4 "{b} : ( {b a} : ( {c b a} : ( {d b c a} : ( d OR c ) ) ) )" {3 4}
+ 5 "{a} : ( {b a} : ( {c b a} : ( {d b c a} : ( d OR c ) ) ) )" {2 3}
+ 6 "{a} : ( {b a} : ( {c b} : ( {d b c a} : ( d OR c ) ) ) )" {}
+ 7 "{a b c} : (b:a AND c:b)" {2}
+ } {
+ do_execsql_test 2.$tn {
+ SELECT rowid FROM t1($q)
+ } $res
+ }
+ foreach {tn w res} {
+ 0 "a MATCH 'a'" {1}
+ 1 "b MATCH 'a'" {2}
+ 2 "b MATCH '{a b c} : a'" {2}
+ 3 "b MATCH 'a OR b'" {1 2}
+ 4 "b MATCH 'a OR a:b'" {2}
+ 5 "b MATCH 'a OR b:b'" {1 2}
+ } {
+ do_execsql_test 3.$tn "
+ SELECT rowid FROM t1 WHERE $w
+ " $res
+ }
+
+ do_catchsql_test 4.1 {
+ SELECT * FROM t1 WHERE rowid MATCH 'a'
+ } {1 {unable to use function MATCH in the requested context}}
}
finish_test
-
-
diff --git a/ext/fts5/test/fts5columnsize.test b/ext/fts5/test/fts5columnsize.test
index dec9b58..2b03d57 100644
--- a/ext/fts5/test/fts5columnsize.test
+++ b/ext/fts5/test/fts5columnsize.test
@@ -143,7 +143,6 @@ do_execsql_test 4.1.1 {
INSERT INTO t5 VALUES('2 4 6 8');
}
-breakpoint
do_execsql_test 4.1.2 {
INSERT INTO t5(t5) VALUES('integrity-check');
}
diff --git a/ext/fts5/test/fts5config.test b/ext/fts5/test/fts5config.test
index 386d112..35894c6 100644
--- a/ext/fts5/test/fts5config.test
+++ b/ext/fts5/test/fts5config.test
@@ -66,7 +66,7 @@ foreach {tn val} {
} {
do_catchsql_test 3.$tn {
INSERT INTO t1(t1, rank) VALUES('rank', $val);
- } {1 {SQL logic error or missing database}}
+ } {1 {SQL logic error}}
}
#-------------------------------------------------------------------------
@@ -110,7 +110,6 @@ do_catchsql_test 5.1 {
CREATE VIRTUAL TABLE xx USING fts5(x, tokenize="porter 'ascii");
} {1 {parse error in tokenize directive}}
-breakpoint
do_catchsql_test 5.2 {
CREATE VIRTUAL TABLE xx USING fts5(x, [y[]);
} {0 {}}
@@ -169,33 +168,33 @@ do_execsql_test 9.0 {
} {}
do_catchsql_test 9.1.1 {
INSERT INTO abc(abc, rank) VALUES('pgsz', -5);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.1.2 {
INSERT INTO abc(abc, rank) VALUES('pgsz', 50000000);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.1.3 {
INSERT INTO abc(abc, rank) VALUES('pgsz', 66.67);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.2.1 {
INSERT INTO abc(abc, rank) VALUES('automerge', -5);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.2.2 {
INSERT INTO abc(abc, rank) VALUES('automerge', 50000000);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.2.3 {
INSERT INTO abc(abc, rank) VALUES('automerge', 66.67);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_execsql_test 9.2.4 {
INSERT INTO abc(abc, rank) VALUES('automerge', 1);
} {}
do_catchsql_test 9.3.1 {
INSERT INTO abc(abc, rank) VALUES('crisismerge', -5);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.3.2 {
INSERT INTO abc(abc, rank) VALUES('crisismerge', 66.67);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_execsql_test 9.3.3 {
INSERT INTO abc(abc, rank) VALUES('crisismerge', 1);
} {}
@@ -205,14 +204,14 @@ do_execsql_test 9.3.4 {
do_catchsql_test 9.4.1 {
INSERT INTO abc(abc, rank) VALUES('nosuchoption', 1);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.5.1 {
INSERT INTO abc(abc, rank) VALUES('hashsize', 'not an integer');
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.5.2 {
INSERT INTO abc(abc, rank) VALUES('hashsize', -500000);
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
do_catchsql_test 9.5.3 {
INSERT INTO abc(abc, rank) VALUES('hashsize', 500000);
} {0 {}}
@@ -245,7 +244,7 @@ foreach {tn opt} {
do_catchsql_test 12.1 {
INSERT INTO t1(t1, rank) VALUES('rank', NULL);;
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
#-------------------------------------------------------------------------
# errors in the 'usermerge' option
@@ -260,8 +259,7 @@ foreach {tn val} {
4 1
} {
set sql "INSERT INTO tt(tt, rank) VALUES('usermerge', $val)"
- do_catchsql_test 13.$tn $sql {1 {SQL logic error or missing database}}
+ do_catchsql_test 13.$tn $sql {1 {SQL logic error}}
}
finish_test
-
diff --git a/ext/fts5/test/fts5conflict.test b/ext/fts5/test/fts5conflict.test
index 5c1e593..644db53 100644
--- a/ext/fts5/test/fts5conflict.test
+++ b/ext/fts5/test/fts5conflict.test
@@ -66,5 +66,3 @@ do_execsql_test 2.1 {
}
finish_test
-
-
diff --git a/ext/fts5/test/fts5content.test b/ext/fts5/test/fts5content.test
index 69e66a5..a74b26d 100644
--- a/ext/fts5/test/fts5content.test
+++ b/ext/fts5/test/fts5content.test
@@ -255,4 +255,3 @@ do_execsql_test 6.2 {
finish_test
-
diff --git a/ext/fts5/test/fts5corrupt.test b/ext/fts5/test/fts5corrupt.test
index edaafb2..1462912 100644
--- a/ext/fts5/test/fts5corrupt.test
+++ b/ext/fts5/test/fts5corrupt.test
@@ -96,4 +96,3 @@ do_catchsql_test 3.1 {
} {1 {database disk image is malformed}}
finish_test
-
diff --git a/ext/fts5/test/fts5corrupt2.test b/ext/fts5/test/fts5corrupt2.test
index c10017a..74721e5 100644
--- a/ext/fts5/test/fts5corrupt2.test
+++ b/ext/fts5/test/fts5corrupt2.test
@@ -269,4 +269,3 @@ do_catchsql_test 6.2 {
sqlite3_fts5_may_be_corrupt 0
finish_test
-
diff --git a/ext/fts5/test/fts5corrupt3.test b/ext/fts5/test/fts5corrupt3.test
index 9653bca..72e9ab1 100644
--- a/ext/fts5/test/fts5corrupt3.test
+++ b/ext/fts5/test/fts5corrupt3.test
@@ -409,4 +409,3 @@ do_catchsql_test 9.2.2 {
sqlite3_fts5_may_be_corrupt 0
finish_test
-
diff --git a/ext/fts5/test/fts5delete.test b/ext/fts5/test/fts5delete.test
new file mode 100644
index 0000000..488ce02
--- /dev/null
+++ b/ext/fts5/test/fts5delete.test
@@ -0,0 +1,53 @@
+# 2017 May 12
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this script is testing the FTS5 module.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5delete
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+fts5_aux_test_functions db
+
+do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE t1 USING fts5(x);
+ WITH s(i) AS (
+ SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<5000
+ )
+ INSERT INTO t1(rowid, x) SELECT i, (i/2)*2 FROM s;
+}
+
+do_test 1.1 {
+ execsql BEGIN
+ for {set i 1} {$i<=5000} {incr i} {
+ if {$i % 2} {
+ execsql { INSERT INTO t1 VALUES($i) }
+ } else {
+ execsql { DELETE FROM t1 WHERE rowid = $i }
+ }
+ }
+ execsql COMMIT
+} {}
+
+do_test 1.2 {
+ execsql { INSERT INTO t1(t1, rank) VALUES('usermerge', 2); }
+ for {set i 0} {$i < 5} {incr i} {
+ execsql { INSERT INTO t1(t1, rank) VALUES('merge', 1) }
+ execsql { INSERT INTO t1(t1) VALUES('integrity-check') }
+ }
+} {}
+
+finish_test
diff --git a/ext/fts5/test/fts5detail.test b/ext/fts5/test/fts5detail.test
index 58fda3e..cf4b718 100644
--- a/ext/fts5/test/fts5detail.test
+++ b/ext/fts5/test/fts5detail.test
@@ -241,4 +241,3 @@ do_execsql_test 5.3 {
finish_test
-
diff --git a/ext/fts5/test/fts5determin.test b/ext/fts5/test/fts5determin.test
index e368c47..c9094ed 100644
--- a/ext/fts5/test/fts5determin.test
+++ b/ext/fts5/test/fts5determin.test
@@ -63,5 +63,3 @@ foreach_detail_mode $::testprefix {
finish_test
-
-
diff --git a/ext/fts5/test/fts5dlidx.test b/ext/fts5/test/fts5dlidx.test
index 655beb9..1fb95a9 100644
--- a/ext/fts5/test/fts5dlidx.test
+++ b/ext/fts5/test/fts5dlidx.test
@@ -66,7 +66,6 @@ proc do_dlidx_test1 {tn spc1 spc2 nEntry iFirst nStep} {
}
execsql COMMIT
- breakpoint
do_test $tn.1 {
execsql { INSERT INTO t1(t1) VALUES('integrity-check') }
} {}
@@ -124,7 +123,6 @@ proc do_dlidx_test2 {tn nEntry iFirst nStep} {
do_execsql_test $tn.1 {
SELECT rowid FROM t1 WHERE t1 MATCH 'b AND a'
} {1}
- breakpoint
do_execsql_test $tn.2 {
SELECT rowid FROM t1 WHERE t1 MATCH 'b AND a' ORDER BY rowid DESC
} {1}
@@ -197,4 +195,3 @@ foreach v $vocab {
finish_test
-
diff --git a/ext/fts5/test/fts5doclist.test b/ext/fts5/test/fts5doclist.test
index 411289a..d8308fd 100644
--- a/ext/fts5/test/fts5doclist.test
+++ b/ext/fts5/test/fts5doclist.test
@@ -44,4 +44,3 @@ do_execsql_test 1.2 {
finish_test
-
diff --git a/ext/fts5/test/fts5eb.test b/ext/fts5/test/fts5eb.test
index 510a0d7..dd66dec 100644
--- a/ext/fts5/test/fts5eb.test
+++ b/ext/fts5/test/fts5eb.test
@@ -81,6 +81,3 @@ do_execsql_test 3.3 {
finish_test
-
-
-
diff --git a/ext/fts5/test/fts5fault1.test b/ext/fts5/test/fts5fault1.test
index 9d63a11..9602513 100644
--- a/ext/fts5/test/fts5fault1.test
+++ b/ext/fts5/test/fts5fault1.test
@@ -351,4 +351,3 @@ do_faultsim_test 9.1 -faults oom-* -prep {
finish_test
-
diff --git a/ext/fts5/test/fts5fault2.test b/ext/fts5/test/fts5fault2.test
index 43c7c7a..aa21f7b 100644
--- a/ext/fts5/test/fts5fault2.test
+++ b/ext/fts5/test/fts5fault2.test
@@ -137,4 +137,3 @@ do_faultsim_test 5.0 -faults oom-* -prep {
}
finish_test
-
diff --git a/ext/fts5/test/fts5fault3.test b/ext/fts5/test/fts5fault3.test
index bfeead4..a294692 100644
--- a/ext/fts5/test/fts5fault3.test
+++ b/ext/fts5/test/fts5fault3.test
@@ -110,4 +110,3 @@ do_faultsim_test 3.2 -faults oom-* -prep {
finish_test
-
diff --git a/ext/fts5/test/fts5fault4.test b/ext/fts5/test/fts5fault4.test
index bfa54a5..877e022 100644
--- a/ext/fts5/test/fts5fault4.test
+++ b/ext/fts5/test/fts5fault4.test
@@ -395,4 +395,3 @@ do_faultsim_test 14.1 -faults oom-t* -prep {
}
finish_test
-
diff --git a/ext/fts5/test/fts5fault5.test b/ext/fts5/test/fts5fault5.test
index 75b7d9a..6b8dade 100644
--- a/ext/fts5/test/fts5fault5.test
+++ b/ext/fts5/test/fts5fault5.test
@@ -105,7 +105,6 @@ do_faultsim_test 3.2 -faults oom-t* -body {
faultsim_test_result {0 {1 10 11 12 13 14 15 16 17 18 19 2}}
}
-breakpoint
do_execsql_test 3.3.0 {
SELECT * FROM tv2;
} {
@@ -130,4 +129,3 @@ do_faultsim_test 3.3 -faults oom-t* -body {
finish_test
-
diff --git a/ext/fts5/test/fts5fault6.test b/ext/fts5/test/fts5fault6.test
index 118761f..1c1c9f2 100644
--- a/ext/fts5/test/fts5fault6.test
+++ b/ext/fts5/test/fts5fault6.test
@@ -280,7 +280,6 @@ do_faultsim_test 5.4 -faults oom* -prep {
#-------------------------------------------------------------------------
catch { db close }
-breakpoint
do_faultsim_test 6 -faults oom* -prep {
sqlite_orig db test.db
sqlite3_db_config_lookaside db 0 0 0
@@ -292,4 +291,3 @@ do_faultsim_test 6 -faults oom* -prep {
db close
}
finish_test
-
diff --git a/ext/fts5/test/fts5fault7.test b/ext/fts5/test/fts5fault7.test
index a35b19a..c93e099 100644
--- a/ext/fts5/test/fts5fault7.test
+++ b/ext/fts5/test/fts5fault7.test
@@ -116,4 +116,3 @@ do_faultsim_test 2.2 -faults oom-* -body {
}
finish_test
-
diff --git a/ext/fts5/test/fts5fault8.test b/ext/fts5/test/fts5fault8.test
index c613490..5afab77 100644
--- a/ext/fts5/test/fts5fault8.test
+++ b/ext/fts5/test/fts5fault8.test
@@ -82,4 +82,3 @@ do_faultsim_test 4 -faults oom-* -prep {
finish_test
-
diff --git a/ext/fts5/test/fts5fault9.test b/ext/fts5/test/fts5fault9.test
index 908a91d..1daa5c1 100644
--- a/ext/fts5/test/fts5fault9.test
+++ b/ext/fts5/test/fts5fault9.test
@@ -153,4 +153,3 @@ do_faultsim_test 6 -faults oom-* -body {
} ;# foreach_detail_mode...
finish_test
-
diff --git a/ext/fts5/test/fts5faultA.test b/ext/fts5/test/fts5faultA.test
index 817ccb4..212401f 100644
--- a/ext/fts5/test/fts5faultA.test
+++ b/ext/fts5/test/fts5faultA.test
@@ -61,4 +61,3 @@ do_faultsim_test 2 -faults oom* -prep {
faultsim_test_result {0 {1 2}}
}
finish_test
-
diff --git a/ext/fts5/test/fts5faultB.test b/ext/fts5/test/fts5faultB.test
index 40df8b6..a4fef52 100644
--- a/ext/fts5/test/fts5faultB.test
+++ b/ext/fts5/test/fts5faultB.test
@@ -78,6 +78,57 @@ do_faultsim_test 2.4 -faults oom* -body {
faultsim_test_result {0 {{3 2} {2 3}}}
}
+#-------------------------------------------------------------------------
+#
+reset_db
+do_execsql_test 3.0 {
+ CREATE VIRTUAL TABLE x1 USING fts5(z);
+}
+
+do_faultsim_test 3.1 -faults oom* -body {
+ execsql {
+ SELECT rowid FROM x1('c') WHERE rowid>1;
+ }
+} -test {
+ faultsim_test_result {0 {}}
+}
+
+do_execsql_test 3.2 {
+ INSERT INTO x1 VALUES('a b c');
+ INSERT INTO x1 VALUES('b c d');
+ INSERT INTO x1 VALUES('c d e');
+ INSERT INTO x1 VALUES('d e f');
+}
+do_faultsim_test 3.3 -faults oom* -body {
+ execsql {
+ SELECT rowid FROM x1('c') WHERE rowid>1;
+ }
+} -test {
+ faultsim_test_result {0 {2 3}}
+}
+
+#-------------------------------------------------------------------------
+# Test OOM injection with nested colsets.
+#
+reset_db
+do_execsql_test 4.0 {
+ CREATE VIRTUAL TABLE t1 USING fts5(a, b, c, d);
+ INSERT INTO t1 VALUES('a', 'b', 'c', 'd'); -- 1
+ INSERT INTO t1 VALUES('d', 'a', 'b', 'c'); -- 2
+ INSERT INTO t1 VALUES('c', 'd', 'a', 'b'); -- 3
+ INSERT INTO t1 VALUES('b', 'c', 'd', 'a'); -- 4
+}
+do_faultsim_test 4.1 -faults oom* -body {
+ execsql { SELECT rowid FROM t1('{a b c} : (b:a AND c:b)'); }
+} -test {
+ faultsim_test_result {0 2}
+}
+
+do_faultsim_test 4.2 -faults oom* -body {
+ execsql { SELECT rowid FROM t1('{a b c} : (a AND d)') }
+} -test {
+ faultsim_test_result {0 {2 3}}
+}
+
finish_test
-
diff --git a/ext/fts5/test/fts5faultD.test b/ext/fts5/test/fts5faultD.test
new file mode 100644
index 0000000..e259cbf
--- /dev/null
+++ b/ext/fts5/test/fts5faultD.test
@@ -0,0 +1,87 @@
+# 2016 February 2
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#*************************************************************************
+#
+# This file is focused on OOM errors.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+source $testdir/malloc_common.tcl
+set testprefix fts5faultA
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+
+foreach_detail_mode $testprefix {
+ if {"%DETAIL%"=="none"} continue
+
+ do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE o1 USING fts5(a, b, c, detail=%DETAIL%);
+ INSERT INTO o1(o1, rank) VALUES('pgsz', 32);
+
+ WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<60 )
+ INSERT INTO o1 SELECT 'A', 'B', 'C' FROM s;
+
+ WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<60 )
+ INSERT INTO o1 SELECT 'C', 'A', 'B' FROM s;
+
+ WITH s(i) AS ( SELECT 1 UNION ALL SELECT i+1 FROM s WHERE i<60 )
+ INSERT INTO o1 SELECT 'B', 'C', 'A' FROM s;
+ }
+
+ do_faultsim_test 1 -faults int* -prep {
+ sqlite3 db test.db
+ } -body {
+ execsql { SELECT count(*) FROM o1('a') }
+ } -test {
+ faultsim_test_result {0 180} {1 {vtable constructor failed: o1}}
+ }
+
+ do_faultsim_test 2 -faults int* -prep {
+ sqlite3 db test.db
+ } -body {
+ execsql { SELECT * FROM o1('a:a AND {b c}:b') ORDER BY rank }
+ expr 1
+ } -test {
+ faultsim_test_result {0 1} {1 {vtable constructor failed: o1}}
+ }
+
+ do_faultsim_test 3 -faults int* -prep {
+ sqlite3 db test.db
+ } -body {
+ execsql { SELECT * FROM o1('{b c}:b NOT a:a') ORDER BY rank }
+ expr 1
+ } -test {
+ faultsim_test_result {0 1} {1 {vtable constructor failed: o1}}
+ }
+
+ do_faultsim_test 4 -faults int* -prep {
+ sqlite3 db test.db
+ } -body {
+ execsql { SELECT * FROM o1('b:b OR a:a') }
+ expr 1
+ } -test {
+ faultsim_test_result {0 1} {1 {vtable constructor failed: o1}}
+ }
+
+ do_faultsim_test 5 -faults int* -prep {
+ sqlite3 db test.db
+ } -body {
+ execsql { SELECT count(*) FROM o1('c:b') }
+ expr 1
+ } -test {
+ faultsim_test_result {0 1} {1 {vtable constructor failed: o1}}
+ }
+}
+
+finish_test
diff --git a/ext/fts5/test/fts5full.test b/ext/fts5/test/fts5full.test
index c640f56..91ded37 100644
--- a/ext/fts5/test/fts5full.test
+++ b/ext/fts5/test/fts5full.test
@@ -40,4 +40,3 @@ do_test 1.1 {
finish_test
-
diff --git a/ext/fts5/test/fts5fuzz1.test b/ext/fts5/test/fts5fuzz1.test
index 599d7bc..52dc23d 100644
--- a/ext/fts5/test/fts5fuzz1.test
+++ b/ext/fts5/test/fts5fuzz1.test
@@ -90,4 +90,3 @@ do_catchsql_test 4.1 {
} {1 {fts5: syntax error near "`"}}
finish_test
-
diff --git a/ext/fts5/test/fts5hash.test b/ext/fts5/test/fts5hash.test
index f3952d6..38257db 100644
--- a/ext/fts5/test/fts5hash.test
+++ b/ext/fts5/test/fts5hash.test
@@ -121,7 +121,6 @@ foreach_detail_mode $testprefix {
}
execsql { CREATE VIRTUAL TABLE t2 USING fts5(x, detail=%DETAIL%) }
-breakpoint
execsql {
INSERT INTO t2 VALUES($small || ' ' || $big);
}
@@ -130,4 +129,3 @@ breakpoint
} ;# foreach_detail_mode
finish_test
-
diff --git a/ext/fts5/test/fts5integrity.test b/ext/fts5/test/fts5integrity.test
index 37ca933..a38b164 100644
--- a/ext/fts5/test/fts5integrity.test
+++ b/ext/fts5/test/fts5integrity.test
@@ -210,4 +210,3 @@ foreach {tn pgsz} {
}
finish_test
-
diff --git a/ext/fts5/test/fts5lastrowid.test b/ext/fts5/test/fts5lastrowid.test
new file mode 100644
index 0000000..d152a8f
--- /dev/null
+++ b/ext/fts5/test/fts5lastrowid.test
@@ -0,0 +1,72 @@
+# 2017 Feb 27
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#***********************************************************************
+#
+# Tests of the last_insert_rowid functionality with fts5.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5lastrowid
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+
+do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE t1 USING fts5(str);
+}
+
+do_execsql_test 1.1 {
+ INSERT INTO t1 VALUES('one string');
+ INSERT INTO t1 VALUES('two string');
+ INSERT INTO t1 VALUES('three string');
+ SELECT last_insert_rowid();
+} {3}
+
+do_execsql_test 1.2 {
+ BEGIN;
+ INSERT INTO t1 VALUES('one string');
+ INSERT INTO t1 VALUES('two string');
+ INSERT INTO t1 VALUES('three string');
+ COMMIT;
+ SELECT last_insert_rowid();
+} {6}
+
+do_execsql_test 1.3 {
+ INSERT INTO t1(rowid, str) VALUES(-22, 'some more text');
+ SELECT last_insert_rowid();
+} {-22}
+
+do_execsql_test 1.4 {
+ BEGIN;
+ INSERT INTO t1(rowid, str) VALUES(45, 'some more text');
+ INSERT INTO t1(rowid, str) VALUES(46, 'some more text');
+ INSERT INTO t1(rowid, str) VALUES(222, 'some more text');
+ SELECT last_insert_rowid();
+ COMMIT;
+ SELECT last_insert_rowid();
+} {222 222}
+
+do_execsql_test 1.5 {
+ CREATE TABLE x1(x);
+ INSERT INTO x1 VALUES('john'), ('paul'), ('george'), ('ringo');
+ INSERT INTO t1 SELECT x FROM x1;
+ SELECT last_insert_rowid();
+} {226}
+
+do_execsql_test 1.6 {
+ INSERT INTO t1(rowid, str) SELECT rowid+10, x FROM x1;
+ SELECT last_insert_rowid();
+} {14}
+
+
+finish_test
diff --git a/ext/fts5/test/fts5leftjoin.test b/ext/fts5/test/fts5leftjoin.test
new file mode 100644
index 0000000..4ef6a89
--- /dev/null
+++ b/ext/fts5/test/fts5leftjoin.test
@@ -0,0 +1,43 @@
+# 2014 June 17
+#
+# The author disclaims copyright to this source code. In place of
+# a legal notice, here is a blessing:
+#
+# May you do good and not evil.
+# May you find forgiveness for yourself and forgive others.
+# May you share freely, never taking more than you give.
+#
+#*************************************************************************
+# This file implements regression tests for SQLite library. The
+# focus of this script is testing the FTS5 module.
+#
+
+source [file join [file dirname [info script]] fts5_common.tcl]
+set testprefix fts5leftjoin
+
+# If SQLITE_ENABLE_FTS5 is not defined, omit this file.
+ifcapable !fts5 {
+ finish_test
+ return
+}
+
+do_execsql_test 1.0 {
+ CREATE VIRTUAL TABLE vt USING fts5(x);
+ INSERT INTO vt VALUES('abc');
+ INSERT INTO vt VALUES('xyz');
+
+ CREATE TABLE t1(a INTEGER PRIMARY KEY);
+ INSERT INTO t1 VALUES(1), (2);
+}
+
+do_execsql_test 1.1 {
+ SELECT * FROM t1 LEFT JOIN (
+ SELECT rowid AS rrr, * FROM vt WHERE vt MATCH 'abc'
+ ) ON t1.a = rrr
+} {1 1 abc 2 {} {}}
+
+do_execsql_test 1.2 {
+ SELECT * FROM t1 LEFT JOIN vt ON (vt MATCH 'abc')
+} {1 abc 2 abc}
+
+finish_test
diff --git a/ext/fts5/test/fts5matchinfo.test b/ext/fts5/test/fts5matchinfo.test
index 99b0767..4dc04b7 100644
--- a/ext/fts5/test/fts5matchinfo.test
+++ b/ext/fts5/test/fts5matchinfo.test
@@ -472,7 +472,7 @@ do_execsql_test 12.1 {
#
reset_db
proc xyz {} {}
-db func fts5 -argcount 0 xyz
+db func fts5 -argcount 1 xyz
do_test 13.1 {
list [catch { sqlite3_fts5_register_matchinfo db } msg] $msg
} {1 SQLITE_ERROR}
@@ -492,4 +492,3 @@ do_catchsql_test 14.2 {
} {1 {unrecognized matchinfo flag: d}}
finish_test
-
diff --git a/ext/fts5/test/fts5merge.test b/ext/fts5/test/fts5merge.test
index 73e006a..3b86167 100644
--- a/ext/fts5/test/fts5merge.test
+++ b/ext/fts5/test/fts5merge.test
@@ -241,4 +241,3 @@ do_execsql_test 6.3 {
finish_test
-
diff --git a/ext/fts5/test/fts5merge2.test b/ext/fts5/test/fts5merge2.test
index 0d3fad8..e8f5bb1 100644
--- a/ext/fts5/test/fts5merge2.test
+++ b/ext/fts5/test/fts5merge2.test
@@ -55,4 +55,3 @@ do_execsql_test 1.2 {
}
finish_test
-
diff --git a/ext/fts5/test/fts5multiclient.test b/ext/fts5/test/fts5multiclient.test
index a1e9aa0..c3716fc 100644
--- a/ext/fts5/test/fts5multiclient.test
+++ b/ext/fts5/test/fts5multiclient.test
@@ -45,4 +45,3 @@ do_multiclient_test tn {
};# do_multiclient_test
};# foreach_detail_mode
finish_test
-
diff --git a/ext/fts5/test/fts5near.test b/ext/fts5/test/fts5near.test
index b4ae205..bbe144a 100644
--- a/ext/fts5/test/fts5near.test
+++ b/ext/fts5/test/fts5near.test
@@ -68,4 +68,3 @@ do_near_test 1.25 "a b c d e f g h i" { NEAR(i a+b+c+d b+c, 4) } 0
finish_test
-
diff --git a/ext/fts5/test/fts5onepass.test b/ext/fts5/test/fts5onepass.test
index a614b78..01021ed 100644
--- a/ext/fts5/test/fts5onepass.test
+++ b/ext/fts5/test/fts5onepass.test
@@ -178,4 +178,3 @@ do_execsql_test 4.3.1 {
do_test 4.2.2 { fts5_level_segs ttt } {3}
finish_test
-
diff --git a/ext/fts5/test/fts5optimize.test b/ext/fts5/test/fts5optimize.test
index 3ef6d8a..e0f0fd7 100644
--- a/ext/fts5/test/fts5optimize.test
+++ b/ext/fts5/test/fts5optimize.test
@@ -106,4 +106,3 @@ foreach {tn nStep} {
do_test 2.$tn.6 { fts5_segcount t1 } 1
}
finish_test
-
diff --git a/ext/fts5/test/fts5phrase.test b/ext/fts5/test/fts5phrase.test
index 6dac684..10598cc 100644
--- a/ext/fts5/test/fts5phrase.test
+++ b/ext/fts5/test/fts5phrase.test
@@ -116,4 +116,3 @@ do_execsql_test 2.0 {
}
finish_test
-
diff --git a/ext/fts5/test/fts5plan.test b/ext/fts5/test/fts5plan.test
index d7f5fd6..a7b70f5 100644
--- a/ext/fts5/test/fts5plan.test
+++ b/ext/fts5/test/fts5plan.test
@@ -30,7 +30,7 @@ do_eqp_test 1.1 {
SELECT * FROM t1, f1 WHERE f1 MATCH t1.x
} {
0 0 0 {SCAN TABLE t1}
- 0 1 1 {SCAN TABLE f1 VIRTUAL TABLE INDEX 1:}
+ 0 1 1 {SCAN TABLE f1 VIRTUAL TABLE INDEX 65537:}
}
do_eqp_test 1.2 {
@@ -43,7 +43,7 @@ do_eqp_test 1.2 {
do_eqp_test 1.3 {
SELECT * FROM f1 WHERE f1 MATCH ? ORDER BY ff
} {
- 0 0 0 {SCAN TABLE f1 VIRTUAL TABLE INDEX 1:}
+ 0 0 0 {SCAN TABLE f1 VIRTUAL TABLE INDEX 65537:}
0 0 0 {USE TEMP B-TREE FOR ORDER BY}
}
@@ -64,4 +64,3 @@ do_eqp_test 1.5 {
finish_test
-
diff --git a/ext/fts5/test/fts5porter.test b/ext/fts5/test/fts5porter.test
index 2535eb7..c7b1ce6 100644
--- a/ext/fts5/test/fts5porter.test
+++ b/ext/fts5/test/fts5porter.test
@@ -11803,4 +11803,3 @@ foreach {in out} $test_vocab {
finish_test
-
diff --git a/ext/fts5/test/fts5porter2.test b/ext/fts5/test/fts5porter2.test
index 5e0aeb0..6e81b2d 100644
--- a/ext/fts5/test/fts5porter2.test
+++ b/ext/fts5/test/fts5porter2.test
@@ -67,4 +67,3 @@ foreach {in out} $test_vocab {
finish_test
-
diff --git a/ext/fts5/test/fts5prefix.test b/ext/fts5/test/fts5prefix.test
index 8e0d5a2..279f312 100644
--- a/ext/fts5/test/fts5prefix.test
+++ b/ext/fts5/test/fts5prefix.test
@@ -9,7 +9,7 @@
#
#***********************************************************************
#
-# This file containst tests focused on prefix indexes.
+# This file contains tests focused on prefix indexes.
#
source [file join [file dirname [info script]] fts5_common.tcl]
@@ -341,5 +341,3 @@ foreach {tn create} {
}
finish_test
-
-
diff --git a/ext/fts5/test/fts5query.test b/ext/fts5/test/fts5query.test
index 570abf4..854651e 100644
--- a/ext/fts5/test/fts5query.test
+++ b/ext/fts5/test/fts5query.test
@@ -79,5 +79,3 @@ for {set tn 1 ; set pgsz 64} {$tn<32} {incr tn; incr pgsz 16} {
finish_test
-
-
diff --git a/ext/fts5/test/fts5rank.test b/ext/fts5/test/fts5rank.test
index a70c5d6..1268eeb 100644
--- a/ext/fts5/test/fts5rank.test
+++ b/ext/fts5/test/fts5rank.test
@@ -90,6 +90,7 @@ do_test 2.7 {
execsql { SELECT rowid FROM tt('a') ORDER BY rank; } db
} {1 3 2}
+db2 close
#--------------------------------------------------------------------------
# At one point there was a problem with queries such as:
@@ -151,4 +152,3 @@ do_execsql_test 4.1 {
finish_test
-
diff --git a/ext/fts5/test/fts5rebuild.test b/ext/fts5/test/fts5rebuild.test
index 1044421..0191831 100644
--- a/ext/fts5/test/fts5rebuild.test
+++ b/ext/fts5/test/fts5rebuild.test
@@ -64,4 +64,3 @@ do_catchsql_test 2.2 {
INSERT INTO nc(nc) VALUES('rebuild');
} {1 {'rebuild' may not be used with a contentless fts5 table}}
finish_test
-
diff --git a/ext/fts5/test/fts5restart.test b/ext/fts5/test/fts5restart.test
index 0dd7d69..db2c62b 100644
--- a/ext/fts5/test/fts5restart.test
+++ b/ext/fts5/test/fts5restart.test
@@ -149,4 +149,3 @@ do_test 4.3 {
finish_test
-
diff --git a/ext/fts5/test/fts5rowid.test b/ext/fts5/test/fts5rowid.test
index 19590cd..e7fd1bc 100644
--- a/ext/fts5/test/fts5rowid.test
+++ b/ext/fts5/test/fts5rowid.test
@@ -216,4 +216,3 @@ do_execsql_test 6.2 {
finish_test
-
diff --git a/ext/fts5/test/fts5simple.test b/ext/fts5/test/fts5simple.test
index 5ac413c..7fb0681 100644
--- a/ext/fts5/test/fts5simple.test
+++ b/ext/fts5/test/fts5simple.test
@@ -411,7 +411,6 @@ do_catchsql_test 19.2 {
#-------------------------------------------------------------------------
reset_db
-breakpoint
do_execsql_test 20.0 {
CREATE VIRTUAL TABLE x1 USING fts5(x);
INSERT INTO x1(x1, rank) VALUES('pgsz', 32);
diff --git a/ext/fts5/test/fts5simple2.test b/ext/fts5/test/fts5simple2.test
index 186d771..e57cea7 100644
--- a/ext/fts5/test/fts5simple2.test
+++ b/ext/fts5/test/fts5simple2.test
@@ -332,7 +332,41 @@ do_execsql_test 16.0 {
DELETE FROM t2;
}
+#-------------------------------------------------------------------------
+#
+reset_db
+do_execsql_test 17.0 {
+ CREATE VIRTUAL TABLE t2 USING fts5(x, y);
+ BEGIN;
+ INSERT INTO t2 VALUES('a aa aaa', 'b bb bbb');
+ INSERT INTO t2 VALUES('a aa aaa', 'b bb bbb');
+ INSERT INTO t2 VALUES('a aa aaa', 'b bb bbb');
+ COMMIT;
+}
+do_execsql_test 17.1 { SELECT * FROM t2('y:a*') WHERE rowid BETWEEN 10 AND 20 }
+do_execsql_test 17.2 {
+ BEGIN;
+ INSERT INTO t2 VALUES('a aa aaa', 'b bb bbb');
+ SELECT * FROM t2('y:a*') WHERE rowid BETWEEN 10 AND 20 ;
+}
+do_execsql_test 17.3 {
+ COMMIT
+}
+
+reset_db
+do_execsql_test 17.4 {
+ CREATE VIRTUAL TABLE t2 USING fts5(x, y);
+ BEGIN;
+ INSERT INTO t2 VALUES('a aa aaa', 'b bb bbb');
+ INSERT INTO t2 VALUES('a aa aaa', 'b bb bbb');
+ SELECT * FROM t2('y:a*') WHERE rowid>66;
+}
+do_execsql_test 17.5.1 { SELECT * FROM t2('x:b* OR y:a*') } ;# transaction opened in 17.4 is still open
+do_execsql_test 17.5.2 { COMMIT ; SELECT * FROM t2('x:b* OR y:a*') } ;# same query again after COMMIT
+do_execsql_test 17.6 {
+ SELECT * FROM t2('x:b* OR y:a*') WHERE rowid>55
+}
+
#db eval {SELECT rowid, fts5_decode_none(rowid, block) aS r FROM t2_data} {puts $r}
finish_test
-
diff --git a/ext/fts5/test/fts5simple3.test b/ext/fts5/test/fts5simple3.test
index c755ea0..0d4972b 100644
--- a/ext/fts5/test/fts5simple3.test
+++ b/ext/fts5/test/fts5simple3.test
@@ -116,4 +116,3 @@ do_execsql_test 4.6 {
finish_test
-
diff --git a/ext/fts5/test/fts5synonym.test b/ext/fts5/test/fts5synonym.test
index 185dda3..86610ee 100644
--- a/ext/fts5/test/fts5synonym.test
+++ b/ext/fts5/test/fts5synonym.test
@@ -152,7 +152,7 @@ foreach {tn expr res} {
1 {abc} {"abc"}
2 {one} {"one"|"i"|"1"}
3 {3} {"3"|"iii"|"three"}
- 4 {3*} {"3"|"iii"|"three" *}
+ 4 {3*} {"3" *}
} {
do_execsql_test 4.1.$tn {
SELECT fts5_expr($expr, 'tokenize=tclnum')
@@ -421,4 +421,3 @@ do_execsql_test 7.1.2 {
} ;# foreach_detail_mode
finish_test
-
diff --git a/ext/fts5/test/fts5synonym2.test b/ext/fts5/test/fts5synonym2.test
index 7e92822..8bbfb07 100644
--- a/ext/fts5/test/fts5synonym2.test
+++ b/ext/fts5/test/fts5synonym2.test
@@ -161,4 +161,3 @@ foreach {tn expr} {
}
finish_test
-
diff --git a/ext/fts5/test/fts5tok1.test b/ext/fts5/test/fts5tok1.test
index 6ba1700..a336f11 100644
--- a/ext/fts5/test/fts5tok1.test
+++ b/ext/fts5/test/fts5tok1.test
@@ -109,7 +109,7 @@ do_catchsql_test 2.0 {
do_catchsql_test 2.1 {
CREATE VIRTUAL TABLE t4 USING fts5tokenize;
SELECT * FROM t4;
-} {1 {SQL logic error or missing database}}
+} {1 {SQL logic error}}
finish_test
diff --git a/ext/fts5/test/fts5tokenizer.test b/ext/fts5/test/fts5tokenizer.test
index 9316d3c..e71979e 100644
--- a/ext/fts5/test/fts5tokenizer.test
+++ b/ext/fts5/test/fts5tokenizer.test
@@ -262,5 +262,43 @@ do_execsql_test 8.3 {
brown dog fox jump lazi over quick the
}
-finish_test
+#-------------------------------------------------------------------------
+# Check that the FTS5_TOKENIZE_PREFIX flag is passed to the tokenizer
+# implementation.
+#
+reset_db
+proc tcl_create {args} { return "tcl_tokenize" }
+sqlite3_fts5_create_tokenizer db tcl tcl_create
+set ::flags [list]
+proc tcl_tokenize {tflags text} {
+ lappend ::flags $tflags
+ foreach {w iStart iEnd} [fts5_tokenize_split $text] {
+ sqlite3_fts5_token $w $iStart $iEnd
+ }
+}
+do_execsql_test 9.1.1 {
+ CREATE VIRTUAL TABLE t1 USING fts5(a, tokenize=tcl);
+ INSERT INTO t1 VALUES('abc');
+ INSERT INTO t1 VALUES('xyz');
+} {}
+do_test 9.1.2 { set ::flags } {document document}
+
+set ::flags [list]
+do_execsql_test 9.2.1 { SELECT * FROM t1('abc'); } {abc}
+do_test 9.2.2 { set ::flags } {query}
+
+set ::flags [list]
+do_execsql_test 9.3.1 { SELECT * FROM t1('ab*'); } {abc}
+do_test 9.3.2 { set ::flags } {prefixquery}
+
+set ::flags [list]
+do_execsql_test 9.4.1 { SELECT * FROM t1('"abc xyz" *'); } {}
+do_test 9.4.2 { set ::flags } {prefixquery}
+
+set ::flags [list]
+do_execsql_test 9.5.1 { SELECT * FROM t1('"abc xyz*"'); } {}
+do_test 9.5.2 { set ::flags } {query}
+
+
+finish_test
diff --git a/ext/fts5/test/fts5unicode.test b/ext/fts5/test/fts5unicode.test
index 46f4c4f..a9874cc 100644
--- a/ext/fts5/test/fts5unicode.test
+++ b/ext/fts5/test/fts5unicode.test
@@ -50,7 +50,6 @@ do_execsql_test 2.0 "
INSERT INTO t2 VALUES('\xC0\xC8\xCC');
INSERT INTO t3 VALUES('\xC0\xC8\xCC');
"
-breakpoint
do_execsql_test 2.1 "
SELECT 't1' FROM t1 WHERE t1 MATCH '\xE0\xE8\xEC';
SELECT 't2' FROM t2 WHERE t2 MATCH '\xE0\xE8\xEC';
@@ -59,4 +58,3 @@ do_execsql_test 2.1 "
finish_test
-
diff --git a/ext/fts5/test/fts5unicode2.test b/ext/fts5/test/fts5unicode2.test
index 8e5bb8e..662b9dd 100644
--- a/ext/fts5/test/fts5unicode2.test
+++ b/ext/fts5/test/fts5unicode2.test
@@ -281,7 +281,6 @@ do_test 4.4 {
#-------------------------------------------------------------------------
-breakpoint
do_unicode_token_test3 5.1 {tokenchars {}} {
sqlite3_reset sqlite3_column_int
} {
diff --git a/ext/fts5/test/fts5unicode3.test b/ext/fts5/test/fts5unicode3.test
index 876ad27..99f9c8c 100644
--- a/ext/fts5/test/fts5unicode3.test
+++ b/ext/fts5/test/fts5unicode3.test
@@ -126,4 +126,3 @@ do_test 1.5 {
finish_test
-
diff --git a/ext/fts5/test/fts5unindexed.test b/ext/fts5/test/fts5unindexed.test
index 16d43f8..8b72c4c 100644
--- a/ext/fts5/test/fts5unindexed.test
+++ b/ext/fts5/test/fts5unindexed.test
@@ -76,4 +76,3 @@ do_execsql_test 3.2 {
finish_test
-
diff --git a/ext/fts5/test/fts5update.test b/ext/fts5/test/fts5update.test
index 399c7ff..b558c2f 100644
--- a/ext/fts5/test/fts5update.test
+++ b/ext/fts5/test/fts5update.test
@@ -117,5 +117,3 @@ do_execsql_test 2.2.integrity {
}
finish_test
-
-
diff --git a/ext/fts5/test/fts5version.test b/ext/fts5/test/fts5version.test
index 7e4d74d..58cd348 100644
--- a/ext/fts5/test/fts5version.test
+++ b/ext/fts5/test/fts5version.test
@@ -61,4 +61,3 @@ do_test 1.7 {
finish_test
-
diff --git a/ext/fts5/test/fts5vocab.test b/ext/fts5/test/fts5vocab.test
index 5e0499f..8c40316 100644
--- a/ext/fts5/test/fts5vocab.test
+++ b/ext/fts5/test/fts5vocab.test
@@ -210,7 +210,6 @@ do_execsql_test 5.0 {
INSERT INTO aux.t1 VALUES('x n z');
}
-breakpoint
do_execsql_test 5.1 {
CREATE VIRTUAL TABLE temp.vm USING fts5vocab(main, t1, row);
CREATE VIRTUAL TABLE temp.vt1 USING fts5vocab(t1, row);
diff --git a/ext/icu/icu.c b/ext/icu/icu.c
index d2beaa3..7c37812 100644
--- a/ext/icu/icu.c
+++ b/ext/icu/icu.c
@@ -493,38 +493,36 @@ static void icuLoadCollation(
** Register the ICU extension functions with database db.
*/
int sqlite3IcuInit(sqlite3 *db){
- struct IcuScalar {
+ static const struct IcuScalar {
const char *zName; /* Function name */
- int nArg; /* Number of arguments */
- int enc; /* Optimal text encoding */
- void *pContext; /* sqlite3_user_data() context */
+ unsigned char nArg; /* Number of arguments */
+ unsigned short enc; /* Optimal text encoding */
+ unsigned char iContext; /* sqlite3_user_data() context */
void (*xFunc)(sqlite3_context*,int,sqlite3_value**);
} scalars[] = {
- {"regexp", 2, SQLITE_ANY|SQLITE_DETERMINISTIC, 0, icuRegexpFunc},
-
- {"lower", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
- {"lower", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
- {"upper", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, (void*)1, icuCaseFunc16},
- {"upper", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, (void*)1, icuCaseFunc16},
-
- {"lower", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
- {"lower", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
- {"upper", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, (void*)1, icuCaseFunc16},
- {"upper", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, (void*)1, icuCaseFunc16},
-
- {"like", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
- {"like", 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
-
- {"icu_load_collation", 2, SQLITE_UTF8, (void*)db, icuLoadCollation},
+ {"icu_load_collation", 2, SQLITE_UTF8, 1, icuLoadCollation},
+ {"regexp", 2, SQLITE_ANY|SQLITE_DETERMINISTIC, 0, icuRegexpFunc},
+ {"lower", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
+ {"lower", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
+ {"upper", 1, SQLITE_UTF16|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
+ {"upper", 2, SQLITE_UTF16|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
+ {"lower", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
+ {"lower", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuCaseFunc16},
+ {"upper", 1, SQLITE_UTF8|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
+ {"upper", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 1, icuCaseFunc16},
+ {"like", 2, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
+ {"like", 3, SQLITE_UTF8|SQLITE_DETERMINISTIC, 0, icuLikeFunc},
};
-
int rc = SQLITE_OK;
int i;
+
for(i=0; rc==SQLITE_OK && i<(int)(sizeof(scalars)/sizeof(scalars[0])); i++){
- struct IcuScalar *p = &scalars[i];
+ const struct IcuScalar *p = &scalars[i];
rc = sqlite3_create_function(
- db, p->zName, p->nArg, p->enc, p->pContext, p->xFunc, 0, 0
+ db, p->zName, p->nArg, p->enc,
+ p->iContext ? (void*)db : (void*)0,
+ p->xFunc, 0, 0
);
}
diff --git a/ext/lsm1/Makefile b/ext/lsm1/Makefile
new file mode 100644
index 0000000..a4f8ebd
--- /dev/null
+++ b/ext/lsm1/Makefile
@@ -0,0 +1,69 @@
+#
+# This Makefile is designed for use with main.mk in the root directory of
+# this project. After including main.mk, the user's makefile should contain:
+#
+#    LSMDIR=$(TOP)/ext/lsm1
+#    LSMOPTS=-fPIC
+#    include $(LSMDIR)/Makefile
+#
+# The most useful targets are [lsmtest] and [lsm.so].
+#
+
+# Objects that make up the LSM library.
+LSMOBJ = \
+  lsm_ckpt.o \
+  lsm_file.o \
+  lsm_log.o \
+  lsm_main.o \
+  lsm_mem.o \
+  lsm_mutex.o \
+  lsm_shared.o \
+  lsm_sorted.o \
+  lsm_str.o \
+  lsm_tree.o \
+  lsm_unix.o \
+  lsm_win32.o \
+  lsm_varint.o \
+  lsm_vtab.o
+
+# Library headers; prerequisites of every library object.
+LSMHDR = \
+  $(LSMDIR)/lsm.h \
+  $(LSMDIR)/lsmInt.h
+
+# Sources of the lsmtest program.
+LSMTESTSRC = $(LSMDIR)/lsm-test/lsmtest1.c $(LSMDIR)/lsm-test/lsmtest2.c \
+  $(LSMDIR)/lsm-test/lsmtest3.c $(LSMDIR)/lsm-test/lsmtest4.c \
+  $(LSMDIR)/lsm-test/lsmtest5.c $(LSMDIR)/lsm-test/lsmtest6.c \
+  $(LSMDIR)/lsm-test/lsmtest7.c $(LSMDIR)/lsm-test/lsmtest8.c \
+  $(LSMDIR)/lsm-test/lsmtest9.c \
+  $(LSMDIR)/lsm-test/lsmtest_datasource.c \
+  $(LSMDIR)/lsm-test/lsmtest_func.c $(LSMDIR)/lsm-test/lsmtest_io.c \
+  $(LSMDIR)/lsm-test/lsmtest_main.c $(LSMDIR)/lsm-test/lsmtest_mem.c \
+  $(LSMDIR)/lsm-test/lsmtest_tdb.c $(LSMDIR)/lsm-test/lsmtest_tdb3.c \
+  $(LSMDIR)/lsm-test/lsmtest_util.c $(LSMDIR)/lsm-test/lsmtest_win32.c
+
+# Headers used by the lsmtest sources. Listed as prerequisites of lsmtest so
+# that editing them triggers a rebuild of the test program.
+LSMTESTHDR = \
+  $(LSMDIR)/lsm-test/lsmtest.h \
+  $(LSMDIR)/lsm-test/lsmtest_tdb.h
+
+
+# all: lsm.so
+
+LSMOPTS += -DLSM_MUTEX_PTHREADS=1 -I$(LSMDIR)
+
+# Link the library objects into a shared library.
+lsm.so: $(LSMOBJ)
+	$(TCCX) -shared -o lsm.so $(LSMOBJ)
+
+# Compile a library source file into an object in the current directory.
+%.o: $(LSMDIR)/%.c $(LSMHDR) sqlite3.h
+	$(TCCX) $(LSMOPTS) -c $<
+
+# Build the test program from the test sources, the library objects and
+# sqlite3.o. The C++ wrapper is not compiled here; the disabled step was:
+#   $(TCPPX) -c $(TOP)/lsm-test/lsmtest_tdb2.cc
+lsmtest$(EXE): $(LSMOBJ) $(LSMTESTSRC) $(LSMTESTHDR) sqlite3.o
+	$(TCCX) $(LSMOPTS) $(LSMTESTSRC) $(LSMOBJ) sqlite3.o -o lsmtest$(EXE) $(THREADLIB)
diff --git a/ext/lsm1/Makefile.msc b/ext/lsm1/Makefile.msc
new file mode 100644
index 0000000..3e5a3b3
--- /dev/null
+++ b/ext/lsm1/Makefile.msc
@@ -0,0 +1,114 @@
+#
+# This Makefile is designed for use with Makefile.msc in the root directory
+# of this project. The Makefile.msc should contain:
+#
+#    LSMDIR=$(TOP)\ext\lsm1
+#    !INCLUDE $(LSMDIR)\Makefile.msc
+#
+# The most useful targets are [lsmtest.exe] and [lsm.dll].
+#
+
+# Objects that make up the LSM library.
+LSMOBJ = \
+  lsm_ckpt.lo \
+  lsm_file.lo \
+  lsm_log.lo \
+  lsm_main.lo \
+  lsm_mem.lo \
+  lsm_mutex.lo \
+  lsm_shared.lo \
+  lsm_sorted.lo \
+  lsm_str.lo \
+  lsm_tree.lo \
+  lsm_unix.lo \
+  lsm_win32.lo \
+  lsm_varint.lo \
+  lsm_vtab.lo
+
+# Library headers; prerequisites of every library object.
+LSMHDR = \
+  $(LSMDIR)\lsm.h \
+  $(LSMDIR)\lsmInt.h
+
+# Sources of the lsmtest program.
+LSMTESTSRC = $(LSMDIR)\lsm-test\lsmtest1.c $(LSMDIR)\lsm-test\lsmtest2.c \
+  $(LSMDIR)\lsm-test\lsmtest3.c $(LSMDIR)\lsm-test\lsmtest4.c \
+  $(LSMDIR)\lsm-test\lsmtest5.c $(LSMDIR)\lsm-test\lsmtest6.c \
+  $(LSMDIR)\lsm-test\lsmtest7.c $(LSMDIR)\lsm-test\lsmtest8.c \
+  $(LSMDIR)\lsm-test\lsmtest9.c \
+  $(LSMDIR)\lsm-test\lsmtest_datasource.c \
+  $(LSMDIR)\lsm-test\lsmtest_func.c $(LSMDIR)\lsm-test\lsmtest_io.c \
+  $(LSMDIR)\lsm-test\lsmtest_main.c $(LSMDIR)\lsm-test\lsmtest_mem.c \
+  $(LSMDIR)\lsm-test\lsmtest_tdb.c $(LSMDIR)\lsm-test\lsmtest_tdb3.c \
+  $(LSMDIR)\lsm-test\lsmtest_util.c $(LSMDIR)\lsm-test\lsmtest_win32.c
+
+# Headers used by the lsmtest sources. Listed as prerequisites of lsmtest.exe
+# so that editing them triggers a rebuild of the test program.
+LSMTESTHDR = \
+  $(LSMDIR)\lsm-test\lsmtest.h \
+  $(LSMDIR)\lsm-test\lsmtest_tdb.h
+
+# all: lsm.dll lsmtest.exe
+
+LSMOPTS = $(NO_WARN) -DLSM_MUTEX_WIN32=1 -I$(LSMDIR)
+
+!IF $(DEBUG)>2
+LSMOPTS = $(LSMOPTS) -DLSM_DEBUG=1
+!ENDIF
+
+!IF $(MEMDEBUG)!=0
+LSMOPTS = $(LSMOPTS) -DLSM_DEBUG_MEM=1
+!ENDIF
+
+# Compile each library source file into its .lo object.
+lsm_ckpt.lo:	$(LSMDIR)\lsm_ckpt.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_ckpt.c
+
+lsm_file.lo:	$(LSMDIR)\lsm_file.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_file.c
+
+lsm_log.lo:	$(LSMDIR)\lsm_log.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_log.c
+
+lsm_main.lo:	$(LSMDIR)\lsm_main.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_main.c
+
+lsm_mem.lo:	$(LSMDIR)\lsm_mem.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_mem.c
+
+lsm_mutex.lo:	$(LSMDIR)\lsm_mutex.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_mutex.c
+
+lsm_shared.lo:	$(LSMDIR)\lsm_shared.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_shared.c
+
+lsm_sorted.lo:	$(LSMDIR)\lsm_sorted.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_sorted.c
+
+lsm_str.lo:	$(LSMDIR)\lsm_str.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_str.c
+
+lsm_tree.lo:	$(LSMDIR)\lsm_tree.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_tree.c
+
+lsm_unix.lo:	$(LSMDIR)\lsm_unix.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_unix.c
+
+lsm_win32.lo:	$(LSMDIR)\lsm_win32.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_win32.c
+
+lsm_varint.lo:	$(LSMDIR)\lsm_varint.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_varint.c
+
+lsm_vtab.lo:	$(LSMDIR)\lsm_vtab.c $(LSMHDR) $(SQLITE3H)
+	$(LTCOMPILE) $(LSMOPTS) -c $(LSMDIR)\lsm_vtab.c
+
+# Link the library objects into lsm.dll, then copy the DLL into $(LSMDIR).
+lsm.dll:	$(LSMOBJ)
+	$(LD) $(LDFLAGS) $(LTLINKOPTS) $(LTLIBPATHS) /DLL /OUT:$@ $(LSMOBJ)
+	copy /Y $@ $(LSMDIR)\$@
+
+# Build the test program from the test sources and the library objects, then copy it into $(LSMDIR).
+lsmtest.exe: $(LSMOBJ) $(LSMTESTSRC) $(LSMTESTHDR) $(LIBOBJ)
+	$(LTLINK) $(LSMOPTS) $(LSMTESTSRC) /link $(LSMOBJ) $(LIBOBJ)
+	copy /Y $@ $(LSMDIR)\$@
diff --git a/ext/lsm1/lsm-test/README b/ext/lsm1/lsm-test/README
new file mode 100644
index 0000000..80654ee
--- /dev/null
+++ b/ext/lsm1/lsm-test/README
@@ -0,0 +1,40 @@
+
+
+Organization of test case files:
+
+ lsmtest1.c: Data tests. Tests that perform many inserts and deletes on a
+ database file, then verify that the contents of the database can
+ be queried.
+
+ lsmtest2.c: Crash tests. Tests that attempt to verify that the database
+ recovers correctly following an application or system crash.
+
+ lsmtest3.c: Rollback tests. Tests that focus on the explicit rollback of
+ transactions and sub-transactions.
+
+ lsmtest4.c: Multi-client tests.
+
+ lsmtest5.c: Multi-client tests with a different thread for each client.
+
+ lsmtest6.c: OOM injection tests.
+
+ lsmtest7.c: API tests.
+
+ lsmtest8.c: Writer crash tests. Tests in this file attempt to verify that
+ the system recovers and other clients proceed unaffected if
+ a process fails in the middle of a write transaction.
+
+ The difference from lsmtest2.c is that this file tests
+ live-recovery (recovery from a failure that occurs while other
+ clients are still running) whereas lsmtest2.c tests recovery
+ from a system or power failure.
+
+ lsmtest9.c: More data tests. These focus on testing that calling
+ lsm_work(nMerge=1) to compact the database does not corrupt it.
+ In other words, that databases containing block-redirects
+ can be read and written.
+
+
+
+
+
diff --git a/ext/lsm1/lsm-test/lsmtest.h b/ext/lsm1/lsm-test/lsmtest.h
new file mode 100644
index 0000000..249bc99
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest.h
@@ -0,0 +1,302 @@
+
+#ifndef __WRAPPER_INT_H_
+#define __WRAPPER_INT_H_
+
+#include "lsmtest_tdb.h"
+#include "sqlite3.h"
+#include "lsm.h"
+
+#include
+#include
+#include
+#include
+#include
+#ifndef _WIN32
+# include <unistd.h>
+#endif
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _WIN32
+# include "windows.h"
+# define gettimeofday win32GetTimeOfDay
+# define F_OK (0)
+# define sleep(sec) Sleep(1000 * (sec))
+# define usleep(usec) Sleep(((usec) + 999) / 1000)
+# ifdef _MSC_VER
+# include <io.h>
+# define snprintf _snprintf
+# define fsync(fd) FlushFileBuffers((HANDLE)_get_osfhandle((fd)))
+# define fdatasync(fd) FlushFileBuffers((HANDLE)_get_osfhandle((fd)))
+# define __va_copy(dst,src) ((dst) = (src))
+# define ftruncate(fd,sz) ((_chsize_s((fd), (sz))==0) ? 0 : -1)
+# else
+# error Unsupported C compiler for Windows.
+# endif
+int win32GetTimeOfDay(struct timeval *, void *);
+#endif
+
+#ifndef _LSM_INT_H
+typedef unsigned int u32;
+typedef unsigned char u8;
+typedef long long int i64;
+typedef unsigned long long int u64;
+#endif
+
+
+#define ArraySize(x) ((int)(sizeof(x) / sizeof((x)[0])))
+
+#define MIN(x,y) ((x)<(y) ? (x) : (y))
+#define MAX(x,y) ((x)>(y) ? (x) : (y))
+
+#define unused_parameter(x) (void)(x)
+
+#define TESTDB_DEFAULT_PAGE_SIZE 4096
+#define TESTDB_DEFAULT_CACHE_SIZE 2048
+
+#ifndef _O_BINARY
+# define _O_BINARY (0)
+#endif
+
+/*
+** Ideally, these should be in lsmtest_tdb.c. But they are here instead so that
+** they can be used by the C++ database wrappers in lsmtest_tdb2.cc.
+*/
+typedef struct DatabaseMethods DatabaseMethods;
+struct TestDb {
+ DatabaseMethods const *pMethods; /* Database methods */
+ const char *zLibrary; /* Library name for tdb_open() */
+};
+struct DatabaseMethods {
+ int (*xClose)(TestDb *);
+ int (*xWrite)(TestDb *, void *, int , void *, int);
+ int (*xDelete)(TestDb *, void *, int);
+ int (*xDeleteRange)(TestDb *, void *, int, void *, int);
+ int (*xFetch)(TestDb *, void *, int, void **, int *);
+ int (*xScan)(TestDb *, void *, int, void *, int, void *, int,
+ void (*)(void *, void *, int , void *, int)
+ );
+ int (*xBegin)(TestDb *, int);
+ int (*xCommit)(TestDb *, int);
+ int (*xRollback)(TestDb *, int);
+};
+
+/*
+** Functions in lsmtest_tdb2.cc (a C++ source file). It contains the
+** wrapper for Kyoto Cabinet. Kyoto Cabinet has a C API, but
+** the primary interface is the C++ API.
+*/
+int test_kc_open(const char*, const char *zFilename, int bClear, TestDb **ppDb);
+int test_kc_close(TestDb *);
+int test_kc_write(TestDb *, void *, int , void *, int);
+int test_kc_delete(TestDb *, void *, int);
+int test_kc_delete_range(TestDb *, void *, int, void *, int);
+int test_kc_fetch(TestDb *, void *, int, void **, int *);
+int test_kc_scan(TestDb *, void *, int, void *, int, void *, int,
+ void (*)(void *, void *, int , void *, int)
+);
+
+int test_mdb_open(const char*, const char *zFile, int bClear, TestDb **ppDb);
+int test_mdb_close(TestDb *);
+int test_mdb_write(TestDb *, void *, int , void *, int);
+int test_mdb_delete(TestDb *, void *, int);
+int test_mdb_fetch(TestDb *, void *, int, void **, int *);
+int test_mdb_scan(TestDb *, void *, int, void *, int, void *, int,
+ void (*)(void *, void *, int , void *, int)
+);
+
+/*
+** Functions in lsmtest_tdb3.c. This file contains the tdb wrapper for lsm.
+** The wrapper for lsm is a bit more involved than the others, as it
+** includes code for a couple of different lsm configurations, and for
+** various types of fault injection and robustness testing.
+*/
+int test_lsm_open(const char*, const char *zFile, int bClear, TestDb **ppDb);
+int test_lsm_lomem_open(const char*, const char*, int bClear, TestDb **ppDb);
+int test_lsm_zip_open(const char*, const char*, int bClear, TestDb **ppDb);
+int test_lsm_small_open(const char*, const char*, int bClear, TestDb **ppDb);
+int test_lsm_mt2(const char*, const char *zFile, int bClear, TestDb **ppDb);
+int test_lsm_mt3(const char*, const char *zFile, int bClear, TestDb **ppDb);
+
+int tdb_lsm_configure(lsm_db *, const char *);
+
+/* Functions in lsmtest_tdb4.c */
+int test_bt_open(const char*, const char *zFile, int bClear, TestDb **ppDb);
+int test_fbt_open(const char*, const char *zFile, int bClear, TestDb **ppDb);
+int test_fbts_open(const char*, const char *zFile, int bClear, TestDb **ppDb);
+
+
+/* Functions in lsmtest_util.c. */
+int testPrngInit(void);
+u32 testPrngValue(u32 iVal);
+void testPrngArray(u32 iVal, u32 *aOut, int nOut);
+void testPrngString(u32 iVal, char *aOut, int nOut);
+
+void testErrorInit(int argc, char **);
+void testPrintError(const char *zFormat, ...);
+void testPrintUsage(const char *zArgs);
+void testPrintFUsage(const char *zFormat, ...);
+void testTimeInit(void);
+int testTimeGet(void);
+
+/* Functions in lsmtest_mem.c. */
+void testMallocInstall(lsm_env *pEnv);
+void testMallocUninstall(lsm_env *pEnv);
+void testMallocCheck(lsm_env *pEnv, int *, int *, FILE *);
+void testMallocOom(lsm_env *pEnv, int, int, void(*)(void*), void *);
+void testMallocOomEnable(lsm_env *pEnv, int);
+
+/* lsmtest.c */
+TestDb *testOpen(const char *zSystem, int, int *pRc);
+void testReopen(TestDb **ppDb, int *pRc);
+void testClose(TestDb **ppDb);
+
+void testFetch(TestDb *, void *, int, void *, int, int *);
+void testWrite(TestDb *, void *, int, void *, int, int *);
+void testDelete(TestDb *, void *, int, int *);
+void testDeleteRange(TestDb *, void *, int, void *, int, int *);
+void testWriteStr(TestDb *, const char *, const char *zVal, int *pRc);
+void testFetchStr(TestDb *, const char *, const char *, int *pRc);
+
+void testBegin(TestDb *pDb, int iTrans, int *pRc);
+void testCommit(TestDb *pDb, int iTrans, int *pRc);
+
+void test_failed(void);
+
+char *testMallocPrintf(const char *zFormat, ...);
+char *testMallocVPrintf(const char *zFormat, va_list ap);
+int testGlobMatch(const char *zPattern, const char *zStr);
+
+void testScanCompare(TestDb *, TestDb *, int, void *, int, void *, int, int *);
+void testFetchCompare(TestDb *, TestDb *, void *, int, int *);
+
+void *testMalloc(int);
+void *testMallocCopy(void *pCopy, int nByte);
+void *testRealloc(void *, int);
+void testFree(void *);
+
+/* lsmtest_bt.c */
+int do_bt(int nArg, char **azArg);
+
+/* lsmtest_io.c */
+int testVfsConfigureDb(TestDb *pDb);
+
+/* lsmtest_func.c */
+int do_show(int nArg, char **azArg);
+int do_work(int nArg, char **azArg);
+
+/* lsmtest_io.c */
+int do_io(int nArg, char **azArg);
+
+/* lsmtest2.c */
+void do_crash_test(const char *zPattern, int *pRc);
+int do_rollback_test(int nArg, char **azArg);
+
+/* lsmtest3.c */
+void test_rollback(const char *zSystem, const char *zPattern, int *pRc);
+
+/* lsmtest4.c */
+void test_mc(const char *zSystem, const char *zPattern, int *pRc);
+
+/* lsmtest5.c */
+void test_mt(const char *zSystem, const char *zPattern, int *pRc);
+
+/* lsmtest6.c */
+void test_oom(const char *zPattern, int *pRc);
+void testDeleteLsmdb(const char *zFile);
+
+void testSaveDb(const char *zFile, const char *zAuxExt);
+void testRestoreDb(const char *zFile, const char *zAuxExt);
+void testCopyLsmdb(const char *zFrom, const char *zTo);
+
+/* lsmtest7.c */
+void test_api(const char *zPattern, int *pRc);
+
+/* lsmtest8.c */
+void do_writer_crash_test(const char *zPattern, int *pRc);
+
+/*************************************************************************
+** Interface to functionality in lsmtest_datasource.c.
+*/
+typedef struct Datasource Datasource;
+typedef struct DatasourceDefn DatasourceDefn;
+
+struct DatasourceDefn {
+ int eType; /* A TEST_DATASOURCE_* value */
+ int nMinKey; /* Minimum key size */
+ int nMaxKey; /* Maximum key size */
+ int nMinVal; /* Minimum value size */
+ int nMaxVal; /* Maximum value size */
+};
+
+#define TEST_DATASOURCE_RANDOM 1
+#define TEST_DATASOURCE_SEQUENCE 2
+
+char *testDatasourceName(const DatasourceDefn *);
+Datasource *testDatasourceNew(const DatasourceDefn *);
+void testDatasourceFree(Datasource *);
+void testDatasourceEntry(Datasource *, int, void **, int *, void **, int *);
+/* End of lsmtest_datasource.c interface.
+*************************************************************************/
+void testDatasourceFetch(
+ TestDb *pDb, /* Database handle */
+ Datasource *pData,
+ int iKey,
+ int *pRc /* IN/OUT: Error code */
+);
+
+void testWriteDatasource(TestDb *, Datasource *, int, int *);
+void testWriteDatasourceRange(TestDb *, Datasource *, int, int, int *);
+void testDeleteDatasource(TestDb *, Datasource *, int, int *);
+void testDeleteDatasourceRange(TestDb *, Datasource *, int, int, int *);
+
+
+/* lsmtest1.c */
+void test_data_1(const char *, const char *, int *pRc);
+void test_data_2(const char *, const char *, int *pRc);
+void test_data_3(const char *, const char *, int *pRc);
+void testDbContents(TestDb *, Datasource *, int, int, int, int, int, int *);
+void testCaseProgress(int, int, int, int *);
+int testCaseNDot(void);
+
+void testCompareDb(Datasource *, int, int, TestDb *, TestDb *, int *);
+int testControlDb(TestDb **ppDb);
+
+typedef struct CksumDb CksumDb;
+CksumDb *testCksumArrayNew(Datasource *, int, int, int);
+char *testCksumArrayGet(CksumDb *, int);
+void testCksumArrayFree(CksumDb *);
+void testCaseStart(int *pRc, char *zFmt, ...);
+void testCaseFinish(int rc);
+void testCaseSkip(void);
+int testCaseBegin(int *, const char *, const char *, ...);
+
+#define TEST_CKSUM_BYTES 29
+int testCksumDatabase(TestDb *pDb, char *zOut);
+int testCountDatabase(TestDb *pDb);
+void testCompareInt(int, int, int *);
+void testCompareStr(const char *z1, const char *z2, int *pRc);
+
+/* lsmtest9.c */
+void test_data_4(const char *, const char *, int *pRc);
+
+
+/*
+** Similar to the Tcl_GetIndexFromObjStruct() Tcl library function.
+*/
+#define testArgSelect(w,x,y,z) testArgSelectX(w,x,sizeof(w[0]),y,z)
+int testArgSelectX(void *, const char *, int, const char *, int *);
+
+#ifdef __cplusplus
+} /* End of the 'extern "C"' block */
+#endif
+
+#endif
diff --git a/ext/lsm1/lsm-test/lsmtest1.c b/ext/lsm1/lsm-test/lsmtest1.c
new file mode 100644
index 0000000..665dc15
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest1.c
@@ -0,0 +1,654 @@
+
+#include "lsmtest.h"
+
+#define DATA_SEQUENTIAL TEST_DATASOURCE_SEQUENCE
+#define DATA_RANDOM TEST_DATASOURCE_RANDOM
+
+typedef struct Datatest1 Datatest1;
+typedef struct Datatest2 Datatest2;
+
+/*
+** An instance of the following structure contains parameters used to
+** customize the test function in this file. Test procedure:
+**
+** 1. Create a data-source based on the "datasource definition" vars.
+**
+** 2. Insert nRow key value pairs into the database.
+**
+** 3. Delete all keys from the database. Deletes are done in the same
+** order as the inserts.
+**
+** During steps 2 and 3 above, after each Datatest1.nVerify inserts or
+** deletes, the following is done:
+**
+** a. Run Datatest1.nTest key lookups and check the results are as expected.
+**
+** b. If Datatest1.bTestScan is true, run a handful (8) of range
+** queries (scanning forwards and backwards). Check that the results
+** are as expected.
+**
+** c. Close and reopen the database. Then run (a) and (b) again.
+*/
+struct Datatest1 {
+ /* Datasource definition */
+ DatasourceDefn defn;
+
+ /* Test procedure parameters */
+ int nRow; /* Number of rows to insert then delete */
+ int nVerify; /* Rows inserted/deleted between two verification passes */
+ int nTest; /* Number of keys to test (0==all) */
+ int bTestScan; /* True to do scan tests */
+};
+
+/*
+** An instance of the following data structure is used to describe the
+** second type of test case in this file. The chief difference between
+** these tests and those described by Datatest1 is that these tests also
+** experiment with range-delete operations. Tests proceed as follows:
+**
+** 1. Open the datasource described by Datatest2.defn.
+**
+** 2. Open a connection on an empty database.
+**
+** 3. Do this Datatest2.nIter times:
+**
+** a) Insert Datatest2.nWrite key-value pairs from the datasource.
+**
+** b) Select two pseudo-random keys and use them as the start
+** and end points of a range-delete operation.
+**
+** c) Verify that the contents of the database are as expected (see
+** below for details).
+**
+** d) Close and then reopen the database handle.
+**
+** e) Verify that the contents of the database are still as expected.
+**
+** The inserts and range deletes are run twice - once on the database being
+** tested and once using a control system (sqlite3, kc etc. - something that
+** works). In order to verify that the contents of the db being tested are
+** correct, the test runs a bunch of scans and lookups on both the test and
+** control databases. If the results are the same, the test passes.
+*/
+struct Datatest2 {
+ DatasourceDefn defn; /* Datasource definition */
+ int nRange; /* Datasource key indexes written are taken modulo this */
+ int nWrite; /* Number of writes per iteration */
+ int nIter; /* Total number of iterations to run */
+};
+
+/*
+** Return a name for test case pTest when it is run against database
+** system zSystem. The name has the form
+**
+**     data.<system>.<datasource-name>.rec=<bRecover>.<nRow>.<nVerify>
+**
+** The string comes from testMallocPrintf(); presumably the caller is
+** expected to release it with testFree() - confirm against callers.
+*/
+static char *getName(const char *zSystem, int bRecover, Datatest1 *pTest){
+  char *zSource = testDatasourceName(&pTest->defn);
+  char *zName = testMallocPrintf("data.%s.%s.rec=%d.%d.%d",
+      zSystem, zSource, bRecover, pTest->nRow, pTest->nVerify
+  );
+
+  /* The datasource name is only needed while formatting. */
+  testFree(zSource);
+  return zName;
+}
+
+/*
+** Open the "control" database (the one whose results are trusted) and
+** store the new handle in *ppDb. The backend is "kyotocabinet" on file
+** "tmp.db" when HAVE_KYOTOCABINET is defined, otherwise "sqlite3" with
+** an empty file name. The tdb_open() result code is returned.
+*/
+int testControlDb(TestDb **ppDb){
+#ifdef HAVE_KYOTOCABINET
+  const char *zLibrary = "kyotocabinet";
+  const char *zFile = "tmp.db";
+#else
+  const char *zLibrary = "sqlite3";
+  const char *zFile = "";
+#endif
+  return tdb_open(zLibrary, zFile, 1, ppDb);
+}
+
+/*
+** Obtain entry iKey from datasource pData and hand its key, together with
+** its value as the expected result, to testFetch() on database pDb.
+** Errors are reported through *pRc by testFetch().
+*/
+void testDatasourceFetch(
+  TestDb *pDb,                    /* Database handle */
+  Datasource *pData,              /* Datasource supplying key and value */
+  int iKey,                       /* Index of the datasource entry */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  void *pQueryKey;                /* Database key to query for */
+  int nQueryKey;                  /* Size of pQueryKey in bytes */
+  void *pExpect;                  /* Expected result of query */
+  int nExpect;                    /* Size of pExpect in bytes */
+
+  testDatasourceEntry(pData, iKey, &pQueryKey, &nQueryKey, &pExpect, &nExpect);
+  testFetch(pDb, pQueryKey, nQueryKey, pExpect, nExpect, pRc);
+}
+
+/*
+** This function is called to test that the contents of database pDb
+** are as expected. In this case, expected is defined as containing
+** key-value pairs iFirst through iLast, inclusive, from data source
+** pData. In other words, a loop like the following could be used to
+** construct a database with identical contents from scratch.
+**
+** for(i=iFirst; i<=iLast; i++){
+** testDatasourceEntry(pData, i, &pKey, &nKey, &pVal, &nVal);
+** // insert (pKey, nKey) -> (pVal, nVal) into database
+** }
+**
+** The key domain consists of keys 0 to (nRow-1), inclusive, from
+** data source pData. For both scan and lookup tests, keys are selected
+** pseudo-randomly from within this set.
+**
+** This function runs nLookupTest lookup tests and nScanTest scan tests.
+**
+** A lookup test consists of selecting a key from the domain and querying
+** pDb for it. The test fails if the presence of the key and, if present,
+** the associated value do not match the expectations defined above.
+**
+** A scan test involves selecting a key from the domain and running
+** the following queries:
+**
+** 1. Scan all keys equal to or greater than the key, in ascending order.
+** 2. Scan all keys equal to or smaller than the key, in descending order.
+**
+** Additionally, if nScanTest is non-zero, the following are
+** run once:
+**
+** 1. Scan all keys in the db, in ascending order.
+** 2. Scan all keys in the db, in descending order.
+**
+** As you would assume, the test fails if the returned values do not match
+** expectations.
+*/
+void testDbContents(
+ TestDb *pDb, /* Database handle being tested */
+ Datasource *pData, /* pDb contains data from here */
+ int nRow, /* Size of key domain */
+ int iFirst, /* Index of first key from pData in pDb */
+ int iLast, /* Index of last key from pData in pDb */
+ int nLookupTest, /* Number of lookup tests to run */
+ int nScanTest, /* Number of scan tests to run */
+ int *pRc /* IN/OUT: Error code */
+){
+ int j;
+ int rc = *pRc;
+
+ /* Scan tests. Note that nScanTest is only tested for being non-zero
+ ** here; the block below runs its eight scan comparisons once. */
+ if( rc==0 && nScanTest ){
+ TestDb *pDb2 = 0;
+
+ /* Open a control db (i.e. one that we assume works) */
+ rc = testControlDb(&pDb2);
+
+ /* Load the control db with datasource entries iFirst..iLast. */
+ for(j=iFirst; rc==0 && j<=iLast; j++){
+ void *pKey; int nKey; /* Database key to insert */
+ void *pVal; int nVal; /* Database value to insert */
+ testDatasourceEntry(pData, j, &pKey, &nKey, &pVal, &nVal);
+ rc = tdb_write(pDb2, pKey, nKey, pVal, nVal);
+ }
+
+ if( rc==0 ){
+ int iKey1;
+ int iKey2;
+ void *pKey1; int nKey1; /* Start key */
+ void *pKey2; int nKey2; /* Final key */
+
+ /* Choose two pseudo-random keys from the domain as scan boundaries.
+ ** The first key is copied into a private buffer before the second
+ ** one is fetched - presumably because testDatasourceEntry() reuses
+ ** its output buffer between calls (confirm). */
+ iKey1 = testPrngValue((iFirst<<8) + (iLast<<16)) % nRow;
+ iKey2 = testPrngValue((iLast<<8) + (iFirst<<16)) % nRow;
+ testDatasourceEntry(pData, iKey1, &pKey2, &nKey1, 0, 0);
+ pKey1 = testMalloc(nKey1+1);
+ memcpy(pKey1, pKey2, nKey1+1);
+ testDatasourceEntry(pData, iKey2, &pKey2, &nKey2, 0, 0);
+
+ /* Compare control and test databases for every combination of
+ ** "no bound"/pKey1 and "no bound"/pKey2, with the third argument
+ ** set to 0 and then to 1 (presumably the scan direction - confirm). */
+ testScanCompare(pDb2, pDb, 0, 0, 0, 0, 0, &rc);
+ testScanCompare(pDb2, pDb, 0, 0, 0, pKey2, nKey2, &rc);
+ testScanCompare(pDb2, pDb, 0, pKey1, nKey1, 0, 0, &rc);
+ testScanCompare(pDb2, pDb, 0, pKey1, nKey1, pKey2, nKey2, &rc);
+ testScanCompare(pDb2, pDb, 1, 0, 0, 0, 0, &rc);
+ testScanCompare(pDb2, pDb, 1, 0, 0, pKey2, nKey2, &rc);
+ testScanCompare(pDb2, pDb, 1, pKey1, nKey1, 0, 0, &rc);
+ testScanCompare(pDb2, pDb, 1, pKey1, nKey1, pKey2, nKey2, &rc);
+ testFree(pKey1);
+ }
+ tdb_close(pDb2);
+ }
+
+ /* Test some lookups. */
+ /* NOTE(review): the next line is truncated in this copy of the patch. The
+ ** loop bound, the declarations of iKey/pKey/nKey/pVal/nVal and the left
+ ** side of the ">=nRow" test are missing - restore them from upstream. */
+ for(j=0; rc==0 && j=nRow ){
+ iKey = j;
+ }else{
+ iKey = testPrngValue(j + (iFirst<<8) + (iLast<<16)) % nRow;
+ }
+
+ testDatasourceEntry(pData, iKey, &pKey, &nKey, &pVal, &nVal);
+ /* Keys outside iFirst..iLast should not be in pDb: pass (0, -1) as the
+ ** expected value in that case. */
+ if( iFirst>iKey || iKey>iLast ){
+ pVal = 0;
+ nVal = -1;
+ }
+
+ testFetch(pDb, pKey, nKey, pVal, nVal, &rc);
+ }
+
+ *pRc = rc;
+}
+
+/*
+** Progress reporting for long running test cases. Given that step i of n
+** has been reached, print "." to stdout (flushing after each one) until
+** *piDot, the count of dots printed so far, is no longer less than
+**
+**     ((nDot*2+1) * i) / (n*2)
+**
+** The updated count is written back to *piDot.
+*/
+void testCaseProgress(int i, int n, int nDot, int *piDot){
+  const int nTarget = ((nDot*2+1) * i) / (n*2);
+  int nPrinted;
+
+  for(nPrinted=*piDot; nPrinted<nTarget; nPrinted++){
+    printf(".");
+    fflush(stdout);
+  }
+  *piDot = nPrinted;
+}
+
+/*
+** Return the dot count (20) that callers in this file pass, whole or
+** halved, as the nDot argument of testCaseProgress().
+*/
+int testCaseNDot(void){
+  return 20;
+}
+
+#if 0
+/*
+** Disabled (compiled out by the "#if 0") scan callback. It prints the key
+** of each visited entry as a C string followed by a newline, so it is only
+** usable when keys are nul-terminated text. The other arguments are unused.
+*/
+static void printScanCb(
+ void *pCtx, void *pKey, int nKey, void *pVal, int nVal
+){
+ printf("%s\n", (char *)pKey);
+ fflush(stdout);
+}
+#endif
+
+/*
+** Close and reopen database *ppDb by way of a file copy. This is a no-op
+** if *pRc is already non-zero. Otherwise:
+**
+**   1. Copy the default database of the library behind *ppDb to "bak.db".
+**   2. Close the connection.
+**   3. Copy "bak.db" back over the default database.
+**   4. Reopen the database, storing the tdb_open() result in *pRc.
+**
+** Presumably restoring the copy taken before the close makes the reopen
+** go through recovery - confirm against testCopyLsmdb().
+*/
+void testReopenRecover(TestDb **ppDb, int *pRc){
+  const char *zLibrary;
+  const char *zDbFile;
+
+  if( *pRc!=0 ) return;
+
+  zLibrary = tdb_library_name(*ppDb);
+  zDbFile = tdb_default_db(zLibrary);
+
+  testCopyLsmdb(zDbFile, "bak.db");
+  testClose(ppDb);
+  testCopyLsmdb("bak.db", zDbFile);
+
+  *pRc = tdb_open(zLibrary, 0, 0, ppDb);
+}
+
+
+/*
+** Run the test case described by *p against database system zSystem.
+**
+** Phase 1 inserts rows 0..nRow-1 from the datasource, p->nVerify rows at
+** a time. Phase 2 deletes them again in the same order and in the same
+** step size. After every step the database contents are verified with
+** testDbContents(), the database is closed and reopened (through
+** testReopenRecover() if bRecover is true, testReopen() otherwise) and the
+** contents are verified a second time.
+**
+** The function always starts from LSM_OK; the final error code is passed
+** to testCaseFinish() and stored in *pRc.
+*/
+static void doDataTest1(
+  const char *zSystem,            /* Database system to test */
+  int bRecover,                   /* True to reopen using testReopenRecover() */
+  Datatest1 *p,                   /* Structure containing test parameters */
+  int *pRc                        /* OUT: Error code */
+){
+  int i;                          /* Number of rows inserted/deleted so far */
+  int iDot;                       /* Progress dots printed in current phase */
+  int rc = LSM_OK;
+  Datasource *pData;
+  TestDb *pDb;
+
+  /* Start the test case, open a database and allocate the datasource. */
+  pDb = testOpen(zSystem, 1, &rc);
+  pData = testDatasourceNew(&p->defn);
+
+  /* Phase 1: insert. Each phase prints half of the progress dots. */
+  i = 0;
+  iDot = 0;
+  while( rc==LSM_OK && i<p->nRow ){
+
+    /* Insert some data */
+    testWriteDatasourceRange(pDb, pData, i, p->nVerify, &rc);
+    i += p->nVerify;
+
+    /* Check that the db content is correct: rows 0..i-1 are present. */
+    testDbContents(pDb, pData, p->nRow, 0, i-1, p->nTest, p->bTestScan, &rc);
+
+    /* Close and reopen the database. */
+    if( bRecover ){
+      testReopenRecover(&pDb, &rc);
+    }else{
+      testReopen(&pDb, &rc);
+    }
+
+    /* Check that the db content is still correct. */
+    testDbContents(pDb, pData, p->nRow, 0, i-1, p->nTest, p->bTestScan, &rc);
+
+    /* Update the progress dots... */
+    testCaseProgress(i, p->nRow, testCaseNDot()/2, &iDot);
+  }
+
+  /* Phase 2: delete, in the same order as the inserts. */
+  i = 0;
+  iDot = 0;
+  while( rc==LSM_OK && i<p->nRow ){
+
+    /* Delete some entries */
+    testDeleteDatasourceRange(pDb, pData, i, p->nVerify, &rc);
+    i += p->nVerify;
+
+    /* Check that the db content is correct: rows i..nRow-1 remain. */
+    testDbContents(pDb, pData, p->nRow, i, p->nRow-1,p->nTest,p->bTestScan,&rc);
+
+    /* Close and reopen the database. */
+    if( bRecover ){
+      testReopenRecover(&pDb, &rc);
+    }else{
+      testReopen(&pDb, &rc);
+    }
+
+    /* Check that the db content is still correct. */
+    testDbContents(pDb, pData, p->nRow, i, p->nRow-1,p->nTest,p->bTestScan,&rc);
+
+    /* Update the progress dots... */
+    testCaseProgress(i, p->nRow, testCaseNDot()/2, &iDot);
+  }
+
+  /* Free the datasource, close the database and finish the test case. */
+  testDatasourceFree(pData);
+  tdb_close(pDb);
+  testCaseFinish(rc);
+  *pRc = rc;
+}
+
+
+void test_data_1(
+ const char *zSystem, /* Database system name */
+ const char *zPattern, /* Run test cases that match this pattern */
+ int *pRc /* IN/OUT: Error code */
+){
+ Datatest1 aTest[] = {
+ { {DATA_RANDOM, 500,600, 1000,2000}, 1000, 100, 10, 0},
+ { {DATA_RANDOM, 20,25, 100,200}, 1000, 250, 1000, 1},
+ { {DATA_RANDOM, 8,10, 100,200}, 1000, 250, 1000, 1},
+ { {DATA_RANDOM, 8,10, 10,20}, 1000, 250, 1000, 1},
+ { {DATA_RANDOM, 8,10, 1000,2000}, 1000, 250, 1000, 1},
+ { {DATA_RANDOM, 8,100, 10000,20000}, 100, 25, 100, 1},
+ { {DATA_RANDOM, 80,100, 10,20}, 1000, 250, 1000, 1},
+ { {DATA_RANDOM, 5000,6000, 10,20}, 100, 25, 100, 1},
+ { {DATA_SEQUENTIAL, 5,10, 10,20}, 1000, 250, 1000, 1},
+ { {DATA_SEQUENTIAL, 5,10, 100,200}, 1000, 250, 1000, 1},
+ { {DATA_SEQUENTIAL, 5,10, 1000,2000}, 1000, 250, 1000, 1},
+ { {DATA_SEQUENTIAL, 5,100, 10000,20000}, 100, 25, 100, 1},
+ { {DATA_RANDOM, 10,10, 100,100}, 100000, 1000, 100, 0},
+ { {DATA_SEQUENTIAL, 10,10, 100,100}, 100000, 1000, 100, 0},
+ };
+
+ int i;
+ int bRecover;
+
+ for(bRecover=0; bRecover<2; bRecover++){
+ if( bRecover==1 && memcmp(zSystem, "lsm", 3) ) break;
+ for(i=0; *pRc==LSM_OK && idefn);
+ rc = testControlDb(&pControl);
+
+ if( tdb_lsm(pDb) ){
+ int nBuf = 32 * 1024 * 1024;
+ lsm_config(tdb_lsm(pDb), LSM_CONFIG_AUTOFLUSH, &nBuf);
+ }
+
+ for(i=0; rc==0 && inIter; i++){
+ void *pKey1; int nKey1;
+ void *pKey2; int nKey2;
+ int ii;
+ int nRange = MIN(p->nIter*p->nWrite, p->nRange);
+
+ for(ii=0; rc==0 && iinWrite; ii++){
+ int iKey = (i*p->nWrite + ii) % p->nRange;
+ testWriteDatasource(pControl, pData, iKey, &rc);
+ testWriteDatasource(pDb, pData, iKey, &rc);
+ }
+
+ testDatasourceEntry(pData, i+1000000, &pKey1, &nKey1, 0, 0);
+ pKey1 = testMallocCopy(pKey1, nKey1);
+ testDatasourceEntry(pData, i+2000000, &pKey2, &nKey2, 0, 0);
+
+ testDeleteRange(pDb, pKey1, nKey1, pKey2, nKey2, &rc);
+ testDeleteRange(pControl, pKey1, nKey1, pKey2, nKey2, &rc);
+ testFree(pKey1);
+
+ testCompareDb(pData, nRange, i, pControl, pDb, &rc);
+ if( bRecover ){
+ testReopenRecover(&pDb, &rc);
+ }else{
+ testReopen(&pDb, &rc);
+ }
+ testCompareDb(pData, nRange, i, pControl, pDb, &rc);
+
+ /* Update the progress dots... */
+ testCaseProgress(i, p->nIter, testCaseNDot(), &iDot);
+ }
+
+ testClose(&pDb);
+ testClose(&pControl);
+ testDatasourceFree(pData);
+ testCaseFinish(rc);
+ *pRc = rc;
+}
+
+/*
+** Return a name for Datatest2 case pTest when run against database system
+** zSystem. The name has the form
+**
+**     data2.<system>.<datasource-name>.rec=<bRecover>.<nRange>.<nWrite>.<nIter>
+**
+** The string is allocated by testMallocPrintf().
+*/
+static char *getName2(const char *zSystem, int bRecover, Datatest2 *pTest){
+  char *zSource = testDatasourceName(&pTest->defn);
+  char *zName = testMallocPrintf("data2.%s.%s.rec=%d.%d.%d.%d",
+      zSystem, zSource, bRecover, pTest->nRange, pTest->nWrite, pTest->nIter
+  );
+
+  /* The datasource name is only needed while formatting. */
+  testFree(zSource);
+  return zName;
+}
+
+void test_data_2(
+ const char *zSystem, /* Database system name */
+ const char *zPattern, /* Run test cases that match this pattern */
+ int *pRc /* IN/OUT: Error code */
+){
+ Datatest2 aTest[] = {
+ /* defn, nRange, nWrite, nIter */
+ { {DATA_RANDOM, 20,25, 100,200}, 10000, 10, 50 },
+ { {DATA_RANDOM, 20,25, 100,200}, 10000, 200, 50 },
+ { {DATA_RANDOM, 20,25, 100,200}, 100, 10, 1000 },
+ { {DATA_RANDOM, 20,25, 100,200}, 100, 200, 50 },
+ };
+
+ int i;
+ int bRecover;
+
+ for(bRecover=0; bRecover<2; bRecover++){
+ if( bRecover==1 && memcmp(zSystem, "lsm", 3) ) break;
+ for(i=0; *pRc==LSM_OK && i> 24) & 0xFF;
+ aBuf[1] = (iVal >> 16) & 0xFF;
+ aBuf[2] = (iVal >> 8) & 0xFF;
+ aBuf[3] = (iVal >> 0) & 0xFF;
+}
+
+/*
+** Write key iKey into aBuf as a nul-terminated decimal string zero-padded
+** to five digits (format "%.5d"). iKey must be in the range 0..99999, in
+** which case exactly 6 bytes are written to aBuf.
+*/
+void dt3PutKey(u8 *aBuf, int iKey){
+  char *zOut = (char *)aBuf;
+
+  assert( iKey>=0 && iKey<100000 );
+  sprintf(zOut, "%.5d", iKey);
+}
+
+/*
+** Run the test case described by *p against database system zSystem.
+**
+** Keys are the integers 1..nRange, formatted by dt3PutKey(). Each of the
+** p->nIter iterations does the following:
+**
+**   1. Write p->nWrite pseudo-randomly chosen keys with pseudo-random
+**      values of nValMin..nValMax-1 bytes.
+**
+**   2. For p->nDelete pseudo-randomly chosen keys K, range-delete using
+**      K-1 and K+1 as the boundaries.
+**
+**   3. Reopen the database and fetch every key in 1..nRange, checking
+**      that the returned size is positive for keys believed present and
+**      negative for keys believed absent.
+**
+** abPresent[] tracks which keys are expected to be present.
+**
+** The error code starts as *pRc; the final value is passed to
+** testCaseFinish() and written back to *pRc.
+*/
+static void doDataTest3(
+  const char *zSystem,            /* Database system to test */
+  Datatest3 *p,                   /* Structure containing test parameters */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int iDot = 0;
+  int rc = *pRc;
+  TestDb *pDb;
+  u8 *abPresent;                  /* Array of boolean */
+  char *aVal;                     /* Buffer to hold values */
+  int i;
+  u32 iSeq = 10;                  /* prng counter */
+
+  /* abPresent[] is indexed 1..nRange. This relies on testMalloc() returning
+  ** zeroed memory so that every key starts out "absent" - TODO confirm. */
+  abPresent = (u8 *)testMalloc(p->nRange+1);
+  aVal = (char *)testMalloc(p->nValMax+1);
+  pDb = testOpen(zSystem, 1, &rc);
+
+  for(i=0; i<p->nIter && rc==0; i++){
+    int ii;
+
+    testCaseProgress(i, p->nIter, testCaseNDot(), &iDot);
+
+    /* Perform nWrite inserts */
+    for(ii=0; ii<p->nWrite; ii++){
+      u8 aKey[6];
+      u32 iKey;
+      int nVal;
+
+      iKey = (testPrngValue(iSeq++) % p->nRange) + 1;
+      nVal = (testPrngValue(iSeq++) % (p->nValMax - p->nValMin)) + p->nValMin;
+      testPrngString(testPrngValue(iSeq++), aVal, nVal);
+      dt3PutKey(aKey, iKey);
+
+      /* sizeof(aKey)-1: the nul-terminator is not part of the key. */
+      testWrite(pDb, aKey, sizeof(aKey)-1, aVal, nVal, &rc);
+      abPresent[iKey] = 1;
+    }
+
+    /* Perform nDelete deletes */
+    for(ii=0; ii<p->nDelete; ii++){
+      u8 aKey1[6];
+      u8 aKey2[6];
+      u32 iKey;
+
+      iKey = (testPrngValue(iSeq++) % p->nRange) + 1;
+      dt3PutKey(aKey1, iKey-1);
+      dt3PutKey(aKey2, iKey+1);
+
+      /* Only iKey is marked absent, so the delete is expected to exclude
+      ** both boundary keys. */
+      testDeleteRange(pDb, aKey1, sizeof(aKey1)-1, aKey2, sizeof(aKey2)-1, &rc);
+      abPresent[iKey] = 0;
+    }
+
+    testReopen(&pDb, &rc);
+
+    /* Verify the presence/absence of every key in the domain. */
+    for(ii=1; rc==0 && ii<=p->nRange; ii++){
+      int nDbVal;
+      void *pDbVal;
+      u8 aKey[6];
+      int dbrc;
+
+      dt3PutKey(aKey, ii);
+      dbrc = tdb_fetch(pDb, aKey, sizeof(aKey)-1, &pDbVal, &nDbVal);
+      testCompareInt(0, dbrc, &rc);
+
+      if( abPresent[ii] ){
+        testCompareInt(1, (nDbVal>0), &rc);
+      }else{
+        testCompareInt(1, (nDbVal<0), &rc);
+      }
+    }
+  }
+
+  /* Release everything allocated above. The two buffers were previously
+  ** leaked on every call. */
+  testClose(&pDb);
+  testFree(aVal);
+  testFree(abPresent);
+  testCaseFinish(rc);
+  *pRc = rc;
+}
+
+/*
+** Return a name for Datatest3 case p when run against database system
+** zSystem. The name has the form
+**
+**     data3.<system>.<nRange>.<nIter>.<nWrite>.<nDelete>.(<nValMin>..<nValMax>)
+**
+** The string is allocated by testMallocPrintf().
+*/
+static char *getName3(const char *zSystem, Datatest3 *p){
+  char *zName;
+
+  zName = testMallocPrintf("data3.%s.%d.%d.%d.%d.(%d..%d)",
+      zSystem, p->nRange, p->nIter, p->nWrite, p->nDelete,
+      p->nValMin, p->nValMax
+  );
+  return zName;
+}
+
+void test_data_3(
+ const char *zSystem, /* Database system name */
+ const char *zPattern, /* Run test cases that match this pattern */
+ int *pRc /* IN/OUT: Error code */
+){
+ Datatest3 aTest[] = {
+ /* nRange, nIter, nWrite, nDelete, nValMin, nValMax */
+ { 100, 1000, 5, 5, 50, 100 },
+ { 100, 1000, 2, 2, 5, 10 },
+ };
+
+ int i;
+
+ for(i=0; *pRc==LSM_OK && inRow++;
+ for(i=0; icksum1 += ((u8 *)pKey)[i];
+ p->cksum2 += p->cksum1;
+ }
+ for(i=0; icksum1 += ((u8 *)pVal)[i];
+ p->cksum2 += p->cksum1;
+ }
+}
+
+/*
+** tdb_scan() callback used by testCountDatabase(). pCtx points to a Cksum
+** structure; each invocation adds one to its nRow field. The key and the
+** value of the visited entry are ignored.
+*/
+static void scanCountDb(
+  void *pCtx,                     /* Pointer to the Cksum to update */
+  void *pKey, int nKey,           /* Entry key. Unused. */
+  void *pVal, int nVal            /* Entry value. Unused. */
+){
+  Cksum *pCount = (Cksum *)pCtx;
+
+  unused_parameter(pKey);
+  unused_parameter(nKey);
+  unused_parameter(pVal);
+  unused_parameter(nVal);
+
+  pCount->nRow++;
+}
+
+
+/*
+** Iterate through the entire contents of database pDb. Write a checksum
+** string based on the db contents into buffer zOut before returning. A
+** checksum string is at most 29 (TEST_CKSUM_BYTES) bytes in size:
+**
+** * 32-bit integer (10 bytes)
+** * 1 space (1 byte)
+** * 32-bit hex (8 bytes)
+** * 1 space (1 byte)
+** * 32-bit hex (8 bytes)
+** * nul-terminator (1 byte)
+**
+** The number of entries in the database is returned.
+*/
+int testCksumDatabase(
+ TestDb *pDb, /* Database handle */
+ char *zOut /* Buffer to write checksum to */
+){
+ Cksum cksum;
+ memset(&cksum, 0, sizeof(Cksum));
+ tdb_scan(pDb, (void *)&cksum, 0, 0, 0, 0, 0, scanCksumDb);
+ sprintf(zOut, "%d %x %x",
+ cksum.nRow, (u32)cksum.cksum1, (u32)cksum.cksum2
+ );
+ assert( strlen(zOut)0 ); */
+ if( testrc==0 ) testrc = lsm_checkpoint(db, 0);
+ }
+ tdb_close(pDb);
+
+ /* Check that the database content is still correct */
+ testCompareCksumLsmdb(DBNAME,
+ bCompress, testCksumArrayGet(pCksumDb, nRow), 0, pRc);
+ }
+
+ testCksumArrayFree(pCksumDb);
+ testDatasourceFree(pData);
+}
+
+/*
+** This test verifies that if a system crash occurs while committing a
+** transaction to the log file, no earlier transactions are lost or damaged.
+*/
+static void crash_test2(int bCompress, int *pRc){
+ const char *DBNAME = "testdb.lsm";
+ const DatasourceDefn defn = {TEST_DATASOURCE_RANDOM, 12, 16, 1000, 1000};
+
+ const int nIter = 200;
+ const int nInsert = 20;
+
+ int i;
+ int iDot = 0;
+ Datasource *pData;
+ CksumDb *pCksumDb;
+ TestDb *pDb;
+
+ /* Allocate datasource. And calculate the expected checksums. */
+ pData = testDatasourceNew(&defn);
+ pCksumDb = testCksumArrayNew(pData, 100, 100+nInsert, 1);
+
+ /* Setup and save the initial database. */
+ testSetupSavedLsmdb("", DBNAME, pData, 100, pRc);
+
+ for(i=0; izTest) ){
+ p->x(p->bCompress, pRc);
+ testCaseFinish(*pRc);
+ }
+ }
+}
diff --git a/ext/lsm1/lsm-test/lsmtest3.c b/ext/lsm1/lsm-test/lsmtest3.c
new file mode 100644
index 0000000..760dec3
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest3.c
@@ -0,0 +1,238 @@
+
+
+/*
+** This file contains tests related to the explicit rollback of database
+** transactions and sub-transactions.
+*/
+
+
+/*
+** Repeat 2000 times (until the db contains 100,000 entries):
+**
+** 1. Open a transaction and insert 500 rows, opening a nested
+** sub-transaction each 100 rows.
+**
+** 2. Roll back to each sub-transaction savepoint. Check the database
+** checksum looks Ok.
+**
+** 3. Every second iteration, roll back the main transaction. Check the
+** db checksum is correct. Every other iteration, commit the main
+** transaction (increasing the size of the db by 100 rows).
+*/
+
+
+#include "lsmtest.h"
+
+/*
+** A table of database checksum strings, one for each row count in the
+** sequence nFirst, nFirst+nStep, ... nLast. testCksumArrayGet() maps a
+** row count nRow to azCksum[(nRow-nFirst)/nStep].
+*/
+struct CksumDb {
+ int nFirst; /* Smallest row count that has an entry */
+ int nLast; /* Largest row count that has an entry */
+ int nStep; /* Difference in row count between adjacent entries */
+ char **azCksum; /* Array of checksum strings */
+};
+
+/*
+** Allocate and return a CksumDb holding one checksum slot for each row
+** count nFirst, nFirst+nStep, ... nLast. (nLast-nFirst) must be a
+** non-negative multiple of nStep. The "lsm" database "tempdb.lsm" is
+** opened and written with rows from datasource pData while this is done.
+** The object is released with testCksumArrayFree().
+**
+** NOTE(review): the return values of malloc() and tdb_open(), and the rc
+** set by testWriteDatasourceRange(), are not checked here.
+*/
+CksumDb *testCksumArrayNew(
+ Datasource *pData,
+ int nFirst,
+ int nLast,
+ int nStep
+){
+ TestDb *pDb;
+ CksumDb *pRet;
+ int i;
+ int nEntry;
+ int rc = 0;
+
+ assert( nLast>=nFirst && ((nLast-nFirst)%nStep)==0 );
+
+ pRet = malloc(sizeof(CksumDb));
+ memset(pRet, 0, sizeof(CksumDb));
+ pRet->nFirst = nFirst;
+ pRet->nLast = nLast;
+ pRet->nStep = nStep;
+ nEntry = 1 + ((nLast - nFirst) / nStep);
+
+ /* Allocate space so that azCksum is an array of nEntry pointers to
+ ** buffers each TEST_CKSUM_BYTES in size. */
+ pRet->azCksum = (char **)malloc(nEntry * (sizeof(char *) + TEST_CKSUM_BYTES));
+ /* NOTE(review): the next line is truncated in this copy of the patch (the
+ ** loop bound and the declaration of pStart are missing). pStart evidently
+ ** addresses the bytes that follow the nEntry pointers. */
+ for(i=0; iazCksum[nEntry]);
+ pRet->azCksum[i] = &pStart[i * TEST_CKSUM_BYTES];
+ }
+
+ /* Write the first nFirst rows, then step through the remaining entries. */
+ tdb_open("lsm", "tempdb.lsm", 1, &pDb);
+ testWriteDatasourceRange(pDb, pData, 0, nFirst, &rc);
+ /* NOTE(review): the next line is truncated too (the loop bound and the
+ ** start of a call taking pRet->azCksum[i] are missing) - restore it from
+ ** upstream. */
+ for(i=0; iazCksum[i]);
+ if( i==nEntry ) break;
+ testWriteDatasourceRange(pDb, pData, nFirst+i*nStep, nStep, &rc);
+ }
+
+ tdb_close(pDb);
+
+ return pRet;
+}
+
+/*
+** Return the checksum string stored in p for a database of nRow rows.
+** nRow must lie in the range p->nFirst..p->nLast and must be a whole
+** number of p->nStep steps away from p->nFirst.
+*/
+char *testCksumArrayGet(CksumDb *p, int nRow){
+  int nOffset;                    /* Distance of nRow from p->nFirst */
+
+  assert( p->nFirst<=nRow && nRow<=p->nLast );
+  nOffset = nRow - p->nFirst;
+  assert( (nOffset % p->nStep)==0 );
+
+  return p->azCksum[nOffset / p->nStep];
+}
+
+/*
+** Release a CksumDb object and its checksum array. The structure is
+** overwritten with 0x55 bytes before it is freed, so that any use of a
+** stale pointer is more likely to be noticed.
+*/
+void testCksumArrayFree(CksumDb *p){
+  char **azCksum = p->azCksum;
+
+  memset(p, 0x55, sizeof(CksumDb));
+  free(azCksum);
+  free(p);
+}
+
+/* End of CksumDb code.
+**************************************************************************/
+
+/*
+** Test utility function. Fetch key/value pair number i from datasource
+** pData and write it into database pDb using testWrite(), which reports
+** errors through *pRc.
+*/
+void testWriteDatasource(TestDb *pDb, Datasource *pData, int i, int *pRc){
+  void *pEntryKey;
+  void *pEntryVal;
+  int nEntryKey;
+  int nEntryVal;
+
+  testDatasourceEntry(pData, i, &pEntryKey, &nEntryKey, &pEntryVal, &nEntryVal);
+  testWrite(pDb, pEntryKey, nEntryKey, pEntryVal, nEntryVal, pRc);
+}
+
+/*
+** Test utility function. Fetch key number i from datasource pData (the
+** value is not requested) and delete that key from database pDb using
+** testDelete(), which reports errors through *pRc.
+*/
+void testDeleteDatasource(TestDb *pDb, Datasource *pData, int i, int *pRc){
+  void *pEntryKey;
+  int nEntryKey;
+
+  testDatasourceEntry(pData, i, &pEntryKey, &nEntryKey, 0, 0);
+  testDelete(pDb, pEntryKey, nEntryKey, pRc);
+}
+
+/*
+** This function inserts nWrite key/value pairs into database pDb - the
+** nWrite key value pairs starting at iFirst from data source pData.
+*/
+void testWriteDatasourceRange(
+ TestDb *pDb, /* Database to write to */
+ Datasource *pData, /* Data source to read values from */
+ int iFirst, /* Index of first key/value pair */
+ int nWrite, /* Number of key/value pairs to write */
+ int *pRc /* IN/OUT: Error code */
+){
+ int i;
+ for(i=0; i2 && rc==0; iTrans--){
+ tdb_rollback(pDb, iTrans);
+ nCurrent -= 100;
+ testCksumDatabase(pDb, zCksum);
+ testCompareStr(zCksum, testCksumArrayGet(pCksum, nCurrent), &rc);
+ }
+
+ if( i%2 ){
+ tdb_rollback(pDb, 0);
+ nCurrent -= 100;
+ testCksumDatabase(pDb, zCksum);
+ testCompareStr(zCksum, testCksumArrayGet(pCksum, nCurrent), &rc);
+ }else{
+ tdb_commit(pDb, 0);
+ }
+ }
+ testCaseFinish(rc);
+
+ skip_rollback_test:
+ tdb_close(pDb);
+ testCksumArrayFree(pCksum);
+ return rc;
+}
+
+/*
+** Entry point for the rollback tests. Nothing is done if *pRc is already
+** non-zero, or if zPattern is not NULL and testGlobMatch() reports that
+** it does not match the test name for zSystem. Otherwise a random
+** datasource is created, rollback_test_1() is run on it and the result
+** is stored in *pRc.
+*/
+void test_rollback(
+  const char *zSystem,            /* Database system name */
+  const char *zPattern,           /* Pattern to match test name (or NULL) */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  DatasourceDefn defn = { TEST_DATASOURCE_RANDOM, 10, 15, 50, 100 };
+  Datasource *pData;
+
+  if( *pRc!=0 ) return;
+
+  if( zPattern ){
+    char *zName = getName(zSystem);
+    int bMatch = testGlobMatch(zPattern, zName);
+    testFree(zName);
+    if( !bMatch ) return;
+  }
+
+  pData = testDatasourceNew(&defn);
+  *pRc = rollback_test_1(zSystem, pData);
+  testDatasourceFree(pData);
+}
diff --git a/ext/lsm1/lsm-test/lsmtest4.c b/ext/lsm1/lsm-test/lsmtest4.c
new file mode 100644
index 0000000..a47241d
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest4.c
@@ -0,0 +1,127 @@
+
+/*
+** This file contains test cases involving multiple database clients.
+*/
+
+#include "lsmtest.h"
+
+/*
+** The following code implements test cases "mc1.*".
+**
+** This test case uses one writer and $nReader readers. All connections
+** are driven by a single thread. All connections are opened at the start
+** of the test and remain open until the test is finished.
+**
+** The test consists of $nStep steps. Each step the following is performed:
+**
+** 1. The writer inserts $nWriteStep records into the db.
+**
+** 2. The writer checks that the contents of the db are as expected.
+**
+** 3. Each reader that currently has an open read transaction also checks
+** that the contents of the db are as expected (according to the snapshot
+** the read transaction is reading - see below).
+**
+** After step 1, reader 1 opens a read transaction. After step 2, reader
+** 2 opens a read transaction, and so on. At step ($nReader+1), reader 1
+** closes the current read transaction and opens a new one. And so on.
+** The result is that at step N (for N > $nReader), there exists a reader
+** with an open read transaction reading the snapshot committed following
+** steps (N-$nReader-1) to N.
+*/
+typedef struct Mctest Mctest;
+struct Mctest {
+ DatasourceDefn defn; /* Datasource to use */
+ int nStep; /* Total number of steps in test */
+ int nWriteStep; /* Number of rows to insert each step */
+ int nReader; /* Number of read connections */
+};
+/*
+** Run the multi-client test described by *pTest against database system
+** zSystem: one writer connection plus pTest->nReader reader connections.
+** Each step writes pTest->nWriteStep more rows through the writer and
+** verifies the database contents with testDbContents(). Errors are
+** reported through *pRc.
+**
+** NOTE(review): several loop headers in this copy of the patch have lost
+** text (see the notes below) - restore them from upstream.
+*/
+static void do_mc_test(
+ const char *zSystem, /* Database system to test */
+ Mctest *pTest, /* Test parameters */
+ int *pRc /* IN/OUT: return code */
+){
+ const int nDomain = pTest->nStep * pTest->nWriteStep;
+ Datasource *pData; /* Source of data */
+ TestDb *pDb; /* First database connection (writer) */
+ int iReader; /* Used to iterate through aReader */
+ int iStep; /* Current step in test */
+ int iDot = 0; /* Progress dots printed so far (see testCaseProgress()) */
+
+ /* Array of reader connections */
+ struct Reader {
+ TestDb *pDb; /* Connection handle */
+ int iLast; /* Current snapshot contains keys 0..iLast */
+ } *aReader;
+
+ /* Create a data source */
+ pData = testDatasourceNew(&pTest->defn);
+
+ /* Open the writer connection */
+ pDb = testOpen(zSystem, 1, pRc);
+
+ /* Allocate aReader */
+ aReader = (struct Reader *)testMalloc(sizeof(aReader[0]) * pTest->nReader);
+ /* NOTE(review): loop condition truncated; it evidently bounds iReader by
+ ** pTest->nReader. */
+ for(iReader=0; iReadernReader; iReader++){
+ aReader[iReader].pDb = testOpen(zSystem, 0, pRc);
+ }
+
+ /* NOTE(review): loop condition truncated; it evidently bounds iStep by
+ ** pTest->nStep. */
+ for(iStep=0; iStepnStep; iStep++){
+ int iLast;
+ int iBegin; /* Start read trans using aReader[iBegin] */
+
+ /* Insert nWriteStep more records into the database */
+ int iFirst = iStep*pTest->nWriteStep;
+ testWriteDatasourceRange(pDb, pData, iFirst, pTest->nWriteStep, pRc);
+
+ /* Check that the db is Ok according to the writer */
+ iLast = (iStep+1) * pTest->nWriteStep - 1;
+ testDbContents(pDb, pData, nDomain, 0, iLast, iLast, 1, pRc);
+
+ /* Have reader (iStep % nReader) open a read transaction here. */
+ iBegin = (iStep % pTest->nReader);
+ /* NOTE(review): the next line has lost a large span of text: the body of
+ ** the "if" that uses iBegin and the start of the loop over the readers
+ ** are both missing. */
+ if( iBeginnReader && aReader[iReader].iLast; iReader++){
+ iLast = aReader[iReader].iLast;
+ testDbContents(
+ aReader[iReader].pDb, pData, nDomain, 0, iLast, iLast, 1, pRc
+ );
+ }
+
+ /* Report progress */
+ testCaseProgress(iStep, pTest->nStep, testCaseNDot(), &iDot);
+ }
+
+ /* Close all readers */
+ /* NOTE(review): loop condition truncated, as for the loop that opens the
+ ** readers. */
+ for(iReader=0; iReadernReader; iReader++){
+ testClose(&aReader[iReader].pDb);
+ }
+ testFree(aReader);
+
+ /* Close the writer-connection and free the datasource */
+ testClose(&pDb);
+ testDatasourceFree(pData);
+}
+
+
+void test_mc(
+ const char *zSystem, /* Database system name */
+ const char *zPattern, /* Run test cases that match this pattern */
+ int *pRc /* IN/OUT: Error code */
+){
+ int i;
+ Mctest aTest[] = {
+ { { TEST_DATASOURCE_RANDOM, 10,10, 100,100 }, 100, 10, 5 },
+ };
+
+ for(i=0; i "k.0000000045".
+**
+** As well as the key/value pairs, the database also contains checksum
+** entries. The checksums form a hierarchy - for every F key/value
+** entries there is one level 1 checksum. And for each F level 1 checksums
+** there is one level 2 checksum. And so on.
+**
+** Checksum keys are encoded as the two byte "c." followed by the
+** checksum level, followed by a 10 digit decimal number containing
+** the value of the first key that contributes to the checksum value.
+** For example, assuming F==10, the level 1 checksum that spans keys
+** 10 to 19 is "c.1.0000000010".
+**
+** Clients may perform one of two operations on the database: a read
+** or a write.
+**
+** READ OPERATIONS:
+**
+** A read operation scans a range of F key/value pairs. It computes
+** the expected checksum and then compares the computed value to the
+** actual value stored in the level 1 checksum entry. It then scans
+** the group of F level 1 checksums, and compares the computed checksum
+** to the associated level 2 checksum value, and so on until the
+** highest level checksum value has been verified.
+**
+** If a checksum ever fails to match the expected value, the test
+** has failed.
+**
+** WRITE OPERATIONS:
+**
+** A write operation involves writing (possibly clobbering) a single
+** key/value pair. The associated level 1 checksum is then recalculated
+** and updated. Then the level 2 checksum, and so on until the highest
+** level checksum has been modified.
+**
+** All updates occur inside a single transaction.
+**
+** INTERFACE:
+**
+** The interface used by test cases to read and write the db consists
+** of type DbParameters and the following functions:
+**
+** dbReadOperation()
+** dbWriteOperation()
+*/
+
+#include "lsmtest.h"
+
+typedef struct DbParameters DbParameters;
+struct DbParameters {
+ int nFanout; /* Checksum fanout (F) */
+ int nKey; /* Size of key space (N) */
+};
+
+#define DB_KEY_BYTES (2+5+10+1)
+
+/*
+** Argument aBuf[] must point to a buffer at least DB_KEY_BYTES in size.
+** This function populates the buffer with a nul-terminated key string.
+**
+** If iLevel is 0 the result is the data key for iKey: "k." followed by
+** iKey as a 10 digit decimal number.
+**
+** Otherwise the result is the key of the level iLevel checksum covering
+** iKey: "c.", the level, "." and then the 10 digit decimal value of the
+** first key of the group. A group at level iLevel spans nFanout^iLevel
+** keys, so iKey is rounded down to a multiple of that. For example, with
+** nFanout==10, iLevel==1 and iKey==17 the result is "c.1.0000000010".
+*/
+static void dbFormatKey(
+  DbParameters *pParam,           /* Database parameters (fanout) */
+  int iLevel,                     /* 0 for a data key, else checksum level */
+  int iKey,                       /* Key value */
+  char *aBuf                      /* Write key string here */
+){
+  if( iLevel==0 ){
+    snprintf(aBuf, DB_KEY_BYTES, "k.%.10d", iKey);
+  }else{
+    int f = 1;                    /* Keys spanned by one level iLevel cksum */
+    int i;
+    for(i=0; i<iLevel; i++) f = f * pParam->nFanout;
+    snprintf(aBuf, DB_KEY_BYTES, "c.%d.%.10d", iLevel, f*(iKey/f));
+  }
+}
+
+/*
+** Format checksum value iVal as a nul-terminated, zero-padded decimal
+** string of at least 10 digits and store it in aBuf[]. At most
+** DB_KEY_BYTES bytes are written, so aBuf[] must point to a buffer at
+** least that large.
+*/
+static void dbFormatCksumValue(
+  u32 iVal,                       /* Checksum value to format */
+  char *aBuf                      /* Write the string here */
+){
+  snprintf(aBuf, DB_KEY_BYTES, "%.10u", iVal);
+}
+
+/*
+** Return the highest level of checksum in the database described
+** by *pParam. This is the smallest L for which nFanout^L is greater than
+** or equal to nKey (0 if nKey is 1 or less).
+**
+** NOTE(review): the loop does not terminate normally when nKey>1 and
+** nFanout is 1 or less; callers are assumed to pass nFanout>=2.
+*/
+static int dbMaxLevel(DbParameters *pParam){
+  int iMax;
+  int n = 1;                      /* Keys covered by one level iMax cksum */
+  for(iMax=0; n<pParam->nKey; iMax++){
+    n = n * pParam->nFanout;
+  }
+  return iMax;
+}
+
+static void dbCksum(
+ void *pCtx, /* IN/OUT: Pointer to u32 containing cksum */
+ void *pKey, int nKey, /* Database key. Unused. */
+ void *pVal, int nVal /* Database value. Checksum this. */
+){
+ u8 *aVal = (u8 *)pVal;
+ u32 *pCksum = (u32 *)pCtx;
+ u32 cksum = *pCksum;
+ int i;
+
+ unused_parameter(pKey);
+ unused_parameter(nKey);
+
+ for(i=0; inFanout entries at level
+** iLevel-1.
+*/
+/*
+** Compute and return the level iLevel (iLevel>=1) checksum for the group
+** of keys that contains iKey. The group spans nFanout^iLevel keys. The
+** checksum is accumulated by scanning pDb with dbCksum() as the callback,
+** from the level (iLevel-1) key of the first key in the group to the
+** level (iLevel-1) key of the last.
+**
+** If *pRc is non-zero on entry the database is not accessed and 0 is
+** returned. Otherwise *pRc is set to the result of tdb_scan().
+*/
+static u32 dbComputeCksum(
+  DbParameters *pParam,           /* Database parameters */
+  TestDb *pDb,                    /* Database connection handle */
+  int iLevel,                     /* Level of checksum to compute */
+  int iKey,                       /* Compute checksum for this key */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  u32 cksum = 0;
+  if( *pRc==0 ){
+    int nFirst;
+    int nLast;
+    int iFirst = 0;
+    int iLast = 0;
+    int i;
+    int f = 1;                    /* Keys spanned by one level iLevel cksum */
+    char zFirst[DB_KEY_BYTES];
+    char zLast[DB_KEY_BYTES];
+
+    assert( iLevel>=1 );
+    for(i=0; i<iLevel; i++) f = f * pParam->nFanout;
+
+    /* First and last key of the group, and the matching scan boundaries. */
+    iFirst = f*(iKey/f);
+    iLast = iFirst + f - 1;
+    dbFormatKey(pParam, iLevel-1, iFirst, zFirst);
+    dbFormatKey(pParam, iLevel-1, iLast, zLast);
+    nFirst = strlen(zFirst);
+    nLast = strlen(zLast);
+
+    *pRc = tdb_scan(pDb, (u32*)&cksum, 0, zFirst, nFirst, zLast, nLast,dbCksum);
+  }
+
+  return cksum;
+}
+
+/*
+** Perform a read operation on key iKey. For each checksum level from 1 up
+** to dbMaxLevel(), the checksum of the group containing iKey is computed
+** with dbComputeCksum(). If the result is non-zero it is compared, using
+** testFetchStr(), with the checksum entry stored in the database.
+**
+** If xDelay is not NULL it is invoked with pDelayCtx after the level 1
+** checksum has been computed and before it is compared (only if the
+** computed value is non-zero).
+**
+** If the database system supports transactions, the whole operation is
+** wrapped in testBegin(pDb, 1, ...) / testCommit(pDb, 0, ...).
+*/
+static void dbReadOperation(
+  DbParameters *pParam,           /* Database parameters */
+  TestDb *pDb,                    /* Database connection handle */
+  void (*xDelay)(void *),         /* Optional delay callback (may be NULL) */
+  void *pDelayCtx,                /* Argument passed to xDelay() */
+  int iKey,                       /* Key to read */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  const int iMax = dbMaxLevel(pParam);
+  int iLevel = 1;
+
+  if( tdb_transaction_support(pDb) ) testBegin(pDb, 1, pRc);
+
+  while( *pRc==0 && iLevel<=iMax ){
+    u32 iCksum = dbComputeCksum(pParam, pDb, iLevel, iKey, pRc);
+    if( iCksum!=0 ){
+      char zKey[DB_KEY_BYTES];
+      char zCksum[DB_KEY_BYTES];
+
+      if( xDelay && iLevel==1 ) xDelay(pDelayCtx);
+      dbFormatCksumValue(iCksum, zCksum);
+      dbFormatKey(pParam, iLevel, iKey, zKey);
+      testFetchStr(pDb, zKey, zCksum, pRc);
+    }
+    iLevel++;
+  }
+
+  if( tdb_transaction_support(pDb) ) testCommit(pDb, 0, pRc);
+}
+
+/*
+** Perform a write operation: store zValue under key iKey, then recompute
+** and store the checksum entry covering iKey at every level from 1 up to
+** dbMaxLevel(). iKey must be in the range 0..nKey-1.
+**
+** If the database system supports transactions and *pRc is zero on entry,
+** a write transaction is opened with tdb_begin(pDb, 2) first. If that
+** call returns 5 (busy) nothing is written, *pRc is left unchanged and 0
+** is returned. Any other result is stored in *pRc. The function finishes
+** with testCommit(pDb, 0, ...) whenever transactions are supported.
+**
+** Returns 1 unless the write transaction could not be opened because the
+** database was busy.
+*/
+static int dbWriteOperation(
+  DbParameters *pParam,           /* Database parameters */
+  TestDb *pDb,                    /* Database connection handle */
+  int iKey,                       /* Key to write to */
+  const char *zValue,             /* Nul-terminated value to write */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  const int iMax = dbMaxLevel(pParam);
+  char zKey[DB_KEY_BYTES];
+  int i;
+  int rc;
+
+  assert( iKey>=0 && iKey<pParam->nKey );
+  dbFormatKey(pParam, 0, iKey, zKey);
+
+  /* Open a write transaction. This may fail - SQLITE4_BUSY */
+  if( *pRc==0 && tdb_transaction_support(pDb) ){
+    rc = tdb_begin(pDb, 2);
+    if( rc==5 ) return 0;
+    *pRc = rc;
+  }
+
+  /* Write the value, then refresh the checksums from the lowest level up.
+  ** Each level is computed from the entries one level below it. */
+  testWriteStr(pDb, zKey, zValue, pRc);
+  for(i=1; i<=iMax; i++){
+    char zCksum[DB_KEY_BYTES];
+    u32 iCksum = 0;
+
+    iCksum = dbComputeCksum(pParam, pDb, i, iKey, pRc);
+    dbFormatCksumValue(iCksum, zCksum);
+    dbFormatKey(pParam, i, iKey, zKey);
+    testWriteStr(pDb, zKey, zCksum, pRc);
+  }
+  if( tdb_transaction_support(pDb) ) testCommit(pDb, 0, pRc);
+  return 1;
+}
+
+/*************************************************************************
+** The following block contains testXXX() functions that implement a
+** wrapper around the system's native multi-thread support. There are no
+** synchronization primitives - just functions to launch and join
+** threads. Wrapper functions are:
+**
+** testThreadSupport()
+**
+** testThreadInit()
+** testThreadShutdown()
+** testThreadLaunch()
+** testThreadWait()
+**
+** testThreadSetHalt()
+** testThreadGetHalt()
+** testThreadSetResult()
+** testThreadGetResult()
+**
+** testThreadEnterMutex()
+** testThreadLeaveMutex()
+*/
+typedef struct ThreadSet ThreadSet;
+#ifdef LSM_MUTEX_PTHREADS
+
+#include <pthread.h>
+#include <unistd.h>
+
+typedef struct Thread Thread;
+struct Thread {
+ int rc; /* Result code stored by testThreadSetResult() */
+ char *zMsg; /* Result message from testThreadSetResult(), or 0 */
+ pthread_t id; /* Thread handle filled in by pthread_create() */
+ void (*xMain)(ThreadSet *, int, void *); /* Entry point run by ttMain() */
+ void *pCtx; /* Context pointer handed to xMain */
+ ThreadSet *pThreadSet; /* Owning thread-set; assigned by testThreadLaunch() */
+};
+
+struct ThreadSet {
+ int bHalt; /* Halt flag: set by testThreadSetHalt(), read by testThreadGetHalt() */
+ int nThread; /* Number of threads */
+ Thread *aThread; /* Array of Thread structures; testThreadInit() places it directly after this struct */
+ pthread_mutex_t mutex; /* Mutex used for cheating (its lock/unlock wrappers are under "#if 0") */
+};
+
+/*
+** Return true if this build supports threads, or false otherwise. If
+** this function returns false, no other testThreadXXX() functions should
+** be called.
+**
+** This is the LSM_MUTEX_PTHREADS variant, so the answer is always 1.
+** The parameter list is spelled (void) so that the declaration is a
+** proper prototype.
+*/
+static int testThreadSupport(void){ return 1; }
+
+/*
+** Create a thread-set with room for nMax threads and return it. The
+** ThreadSet header and its array of nMax Thread slots come from a single
+** testMalloc() allocation, with the array placed immediately after the
+** header. The embedded mutex is initialized with default attributes.
+**
+** Every handle returned here should eventually be passed to
+** testThreadShutdown().
+*/
+static ThreadSet *testThreadInit(int nMax){
+  int nAlloc = sizeof(ThreadSet) + sizeof(struct Thread) * nMax;
+  ThreadSet *pNew = (ThreadSet *)testMalloc(nAlloc);
+
+  pNew->aThread = (Thread *)&pNew[1];
+  pNew->nThread = nMax;
+  pthread_mutex_init(&pNew->mutex, 0);
+  return pNew;
+}
+
+/*
+** Delete a thread-set object and release all resources held by it: the
+** result message of each of the p->nThread slots, the mutex, and finally
+** the allocation that holds the ThreadSet and its Thread array.
+*/
+static void testThreadShutdown(ThreadSet *p){
+  int i;
+  for(i=0; i<p->nThread; i++){
+    testFree(p->aThread[i].zMsg);
+  }
+  pthread_mutex_destroy(&p->mutex);
+  testFree(p);
+}
+
+/*
+** Start routine handed to pthread_create(). pArg is the Thread slot of
+** the new thread. Its index within the owning thread-set is derived from
+** its position in the aThread[] array, and the user-supplied xMain
+** callback is then invoked. Always returns 0.
+*/
+static void *ttMain(void *pArg){
+  Thread *pSelf = (Thread *)pArg;
+  ThreadSet *pSet = pSelf->pThreadSet;
+  int iSlot = (pSelf - pSet->aThread);
+
+  pSelf->xMain(pSet, iSlot, pSelf->pCtx);
+  return 0;
+}
+
+/*
+** Launch a new thread in slot iThread of thread-set p. The thread runs
+** xMain(p, iThread, pCtx) by way of ttMain().
+**
+** iThread must be in the range 0..p->nThread-1 and the slot must not
+** have been used before (its pThreadSet pointer must still be 0).
+**
+** The return value is that of pthread_create().
+*/
+static int testThreadLaunch(
+  ThreadSet *p,                             /* Thread-set handle */
+  int iThread,                              /* Slot to launch thread in */
+  void (*xMain)(ThreadSet *, int, void *),  /* Thread main routine */
+  void *pCtx                                /* Context passed to xMain */
+){
+  int rc;
+  Thread *pThread;
+
+  assert( iThread>=0 && iThread<p->nThread );
+
+  pThread = &p->aThread[iThread];
+  assert( pThread->pThreadSet==0 );
+  pThread->xMain = xMain;
+  pThread->pCtx = pCtx;
+  pThread->pThreadSet = p;
+  rc = pthread_create(&pThread->id, 0, ttMain, (void *)pThread);
+
+  return rc;
+}
+
+/*
+** Raise the "halt" flag of the thread-set. The flag is only ever set to
+** 1 by this function.
+*/
+static void testThreadSetHalt(ThreadSet *pThreadSet){
+  ThreadSet *pSet = pThreadSet;
+  pSet->bHalt = 1;
+}
+
+/*
+** Report the value currently stored in the thread-set "halt" flag.
+*/
+static int testThreadGetHalt(ThreadSet *pThreadSet){
+  const int bHalted = pThreadSet->bHalt;
+  return bHalted;
+}
+
+/*
+** Sleep for roughly nMs milliseconds, in slices of 50ms (usleep(50000)).
+** The halt flag is polled before each slice and the sleep ends early
+** once it is set.
+*/
+static void testThreadSleep(ThreadSet *pThreadSet, int nMs){
+  int nLeft;
+  for(nLeft=nMs; nLeft>0 && testThreadGetHalt(pThreadSet)==0; nLeft-=50){
+    usleep(50000);
+  }
+}
+
+/*
+** Wait for all threads launched to finish before returning.
+**
+** The function first sleeps for up to nMs milliseconds (see
+** testThreadSleep(); the sleep ends early if the halt flag is raised),
+** then sets the "halt" flag to tell all threads to stop, and finally
+** joins every slot that has an xMain routine set.
+*/
+static void testThreadWait(ThreadSet *pThreadSet, int nMs){
+  int i;
+
+  testThreadSleep(pThreadSet, nMs);
+  testThreadSetHalt(pThreadSet);
+  for(i=0; i<pThreadSet->nThread; i++){
+    Thread *pThread = &pThreadSet->aThread[i];
+    if( pThread->xMain ){
+      pthread_join(pThread->id, 0);
+    }
+  }
+}
+
+/*
+** Record the outcome of thread iThread. Any message stored earlier is
+** freed, rc becomes the slot's result code and, if zFmt is not NULL, a
+** new message is built from zFmt and the trailing arguments using
+** testMallocVPrintf(). With a NULL zFmt the slot is left without a
+** message.
+*/
+static void testThreadSetResult(
+  ThreadSet *pThreadSet,          /* Thread-set handle */
+  int iThread,                    /* Set result for this thread */
+  int rc,                         /* Result error code */
+  char *zFmt,                     /* Result string format */
+  ...                             /* Result string formatting args... */
+){
+  Thread *pSlot = &pThreadSet->aThread[iThread];
+
+  testFree(pSlot->zMsg);
+  pSlot->rc = rc;
+  pSlot->zMsg = 0;
+  if( zFmt!=0 ){
+    va_list ap;
+    va_start(ap, zFmt);
+    pSlot->zMsg = testMallocVPrintf(zFmt, ap);
+    va_end(ap);
+  }
+}
+
+/*
+** Fetch the outcome recorded for thread iThread. The result code is
+** returned. If pzRes is not NULL, *pzRes is set to point at the message
+** held in the thread's slot (the pointer is not copied or duplicated).
+*/
+static int testThreadGetResult(
+  ThreadSet *pThreadSet,          /* Thread-set handle */
+  int iThread,                    /* Get result for this thread */
+  const char **pzRes              /* OUT: Pointer to result string */
+){
+  const Thread *pSlot = &pThreadSet->aThread[iThread];
+
+  if( pzRes!=0 ){
+    *pzRes = pSlot->zMsg;
+  }
+  return pSlot->rc;
+}
+
+/*
+** Enter and leave the test case mutex.
+**
+** Both wrappers lock/unlock ThreadSet.mutex. They are currently compiled
+** out by the "#if 0" that follows, so nothing in this build uses them.
+*/
+#if 0
+static void testThreadEnterMutex(ThreadSet *p){
+ pthread_mutex_lock(&p->mutex);
+}
+static void testThreadLeaveMutex(ThreadSet *p){
+ pthread_mutex_unlock(&p->mutex);
+}
+#endif
+#endif
+
+#if !defined(LSM_MUTEX_PTHREADS)
+/*
+** Fallback used when LSM_MUTEX_PTHREADS is not defined: report that
+** threads are not supported. The parameter list is spelled (void) so
+** that the declaration is a proper prototype.
+*/
+static int testThreadSupport(void){ return 0; }
+
+/*
+** No-op replacements for the thread wrappers. Those that are used as
+** expressions evaluate to 0; the others expand to nothing.
+*/
+#define testThreadInit(a) 0
+#define testThreadShutdown(a)
+#define testThreadLaunch(a,b,c,d) 0
+#define testThreadWait(a,b)
+#define testThreadSetHalt(a)
+#define testThreadGetHalt(a) 0
+#define testThreadGetResult(a,b,c) 0
+#define testThreadSleep(a,b) 0
+
+/*
+** testThreadSetResult() is variadic and so cannot be replaced by a
+** simple function-like macro; this stub ignores all of its arguments.
+*/
+static void testThreadSetResult(ThreadSet *a, int b, int c, char *d, ...){
+  unused_parameter(a);
+  unused_parameter(b);
+  unused_parameter(c);
+  unused_parameter(d);
+}
+#endif
+/* End of threads wrapper.
+*************************************************************************/
+
+/*************************************************************************
+** Below this point is the third part of this file - the implementation
+** of the mt1.* tests.
+*/
+typedef struct Mt1Test Mt1Test;
+struct Mt1Test {
+ DbParameters param; /* Description of database to read/write */
+ int nReadwrite; /* Number of read/write threads (thread indexes 0..nReadwrite-1) */
+ int nFastReader; /* Number of fast reader threads (the indexes that follow) */
+ int nSlowReader; /* Number of slow reader threads (all remaining indexes) */
+ int nMs; /* How long to run for; slow readers delay for nMs/20 */
+ const char *zSystem; /* Database system to test; assigned by do_test_mt1() */
+};
+
+typedef struct Mt1DelayCtx Mt1DelayCtx;
+struct Mt1DelayCtx {
+ ThreadSet *pSet; /* Threadset to sleep within (its halt flag cuts the sleep short) */
+ int nMs; /* Sleep in ms, as passed to testThreadSleep() */
+};
+
+/*
+** Delay callback for dbReadOperation(). pCtx points at an Mt1DelayCtx;
+** the calling thread sleeps for the configured number of milliseconds
+** within the configured thread-set.
+*/
+static void xMt1Delay(void *pCtx){
+  Mt1DelayCtx *pDelay = (Mt1DelayCtx *)pCtx;
+  testThreadSleep(pDelay->pSet, pDelay->nMs);
+}
+
+#define MT1_THREAD_RDWR 0 /* Thread that both reads and writes */
+#define MT1_THREAD_SLOW 1 /* Reader whose read delay is Mt1Test.nMs/20 ms */
+#define MT1_THREAD_FAST 2 /* Reader with a read delay of 0 */
+
+static void xMt1Work(lsm_db *pDb, void *pCtx){ /* Work hook installed by mt1Main(); does nothing while the body is disabled */
+#if 0
+ char *z = 0;
+ lsm_info(pDb, LSM_INFO_DB_STRUCTURE, &z);
+ printf("%s\n", z);
+ fflush(stdout);
+#endif
+}
+
+/*
+** This is the main() proc for all threads in test case "mt1".
+**
+** pCtx points at the Mt1Test being run. The role of the thread follows
+** from its index: the first nReadwrite threads read and write, the next
+** nFastReader threads only read, and the remaining threads only read
+** with a delay of nMs/20 milliseconds passed to dbReadOperation().
+**
+** Each thread opens its own database connection and loops until an
+** error occurs or the thread-set halt flag is set. On exit the result
+** (error code, or a "r/w: N/M" message) is stored in the thread-set.
+*/
+static void mt1Main(ThreadSet *pThreadSet, int iThread, void *pCtx){
+  Mt1Test *p = (Mt1Test *)pCtx;   /* Test parameters */
+  Mt1DelayCtx delay;              /* Context for xMt1Delay() */
+  int nRead = 0;                  /* Number of calls to dbReadOperation() */
+  int nWrite = 0;                 /* Number of completed database writes */
+  int rc = 0;                     /* Error code */
+  int iPrng;                      /* Prng argument variable */
+  TestDb *pDb;                    /* Database handle */
+  int eType;                      /* One of the MT1_THREAD_XXX values */
+
+  delay.pSet = pThreadSet;
+  delay.nMs = 0;
+  if( iThread<p->nReadwrite ){
+    eType = MT1_THREAD_RDWR;
+  }else if( iThread<(p->nReadwrite+p->nFastReader) ){
+    eType = MT1_THREAD_FAST;
+  }else{
+    eType = MT1_THREAD_SLOW;
+    delay.nMs = (p->nMs / 20);
+  }
+
+  /* Open a new database connection. Initialize the pseudo-random number
+  ** argument based on the thread number. */
+  iPrng = testPrngValue(iThread);
+  pDb = testOpen(p->zSystem, 0, &rc);
+
+  if( rc==0 ){
+    tdb_lsm_config_work_hook(pDb, xMt1Work, 0);
+  }
+
+  /* Loop until either an error occurs or some other thread sets the
+  ** halt flag. */
+  while( rc==0 && testThreadGetHalt(pThreadSet)==0 ){
+    int iKey;
+
+    /* Perform a read operation on an arbitrarily selected key. */
+    iKey = (testPrngValue(iPrng++) % p->param.nKey);
+    dbReadOperation(&p->param, pDb, xMt1Delay, (void *)&delay, iKey, &rc);
+    if( rc ) continue;
+    nRead++;
+
+    /* Attempt to write an arbitrary key value pair (and update the associated
+    ** checksum entries). dbWriteOperation() returns 1 if the write is
+    ** successful, or 0 if it failed with an LSM_BUSY error. */
+    if( eType==MT1_THREAD_RDWR ){
+      char aValue[50];
+      char aRnd[25];
+
+      iKey = (testPrngValue(iPrng++) % p->param.nKey);
+      testPrngString(iPrng, aRnd, sizeof(aRnd));
+      iPrng += sizeof(aRnd);
+      snprintf(aValue, sizeof(aValue), "%d.%s", iThread, aRnd);
+      nWrite += dbWriteOperation(&p->param, pDb, iKey, aValue, &rc);
+    }
+  }
+  testClose(&pDb);
+
+  /* If an error has occurred, set the thread error code and the threadset
+  ** halt flag to tell the other test threads to halt. Otherwise, set the
+  ** thread error code to 0 and post a message with the number of read
+  ** and write operations completed. */
+  if( rc ){
+    testThreadSetResult(pThreadSet, iThread, rc, 0);
+    testThreadSetHalt(pThreadSet);
+  }else{
+    testThreadSetResult(pThreadSet, iThread, 0, "r/w: %d/%d", nRead, nWrite);
+  }
+}
+
+static void do_test_mt1(
+ const char *zSystem, /* Database system name */
+ const char *zPattern, /* Run test cases that match this pattern */
+ int *pRc /* IN/OUT: Error code */
+){
+ Mt1Test aTest[] = {
+ /* param, nReadwrite, nFastReader, nSlowReader, nMs, zSystem (0 here; assigned from the zSystem argument before a case runs) */
+ { {10, 1000}, 4, 0, 0, 10000, 0 },
+ { {10, 1000}, 4, 4, 2, 100000, 0 },
+ { {10, 100000}, 4, 0, 0, 10000, 0 },
+ { {10, 100000}, 4, 4, 2, 100000, 0 },
+ };
+ int i;
+
+ for(i=0; *pRc==0 && iparam.nFanout, p->param.nKey, /* NOTE(review): line is truncated - the rest of the loop header and the start of the call these arguments belong to are missing */
+ p->nMs, p->nReadwrite, p->nFastReader, p->nSlowReader
+ );
+ if( bRun ){
+ TestDb *pDb;
+ ThreadSet *pSet;
+ int iThread;
+ int nThread;
+
+ p->zSystem = zSystem;
+ pDb = testOpen(zSystem, 1, pRc);
+
+ nThread = p->nReadwrite + p->nFastReader + p->nSlowReader; /* One thread per configured reader/writer */
+ pSet = testThreadInit(nThread);
+ for(iThread=0; *pRc==0 && iThreadnMs); /* NOTE(review): truncated - loop condition, loop body and the start of the call ending in "nMs)" are missing */
+ for(iThread=0; *pRc==0 && iThreadiNext = 1; /* NOTE(review): truncated - the remainder of this function is missing; the text from "iNext = 1" onwards appears to be the tail of a different function */
+ p->bEnable = 1;
+ p->nFail = 1;
+ p->pEnv = tdb_lsm_env();
+}
+
+/*
+** Callback registered with testMallocOom() by testOomContinue(). Each
+** invocation bumps the failure counter of the OomTest object.
+*/
+static void xOomHook(OomTest *p){
+  p->nFail += 1;
+}
+
+/*
+** Decide whether another OOM iteration should run. Returns 0 if an
+** error has been recorded in p->rc, or if this is not the first
+** iteration (iNext>1) and the failure counter is still zero. Otherwise
+** the failure counter is reset, testMallocOom() is armed with p->iNext
+** and xOomHook(), and 1 is returned.
+*/
+static int testOomContinue(OomTest *p){
+  const int bError = (p->rc!=0);
+  const int bNoFault = (p->iNext>1 && p->nFail==0);
+
+  if( bError || bNoFault ) return 0;
+
+  p->nFail = 0;
+  testMallocOom(p->pEnv, p->iNext, 0, (void (*)(void*))xOomHook, (void *)p);
+  return 1;
+}
+
+/*
+** Store bEnable in the OomTest object and forward it to
+** testMallocOomEnable() for the test's environment.
+*/
+static void testOomEnable(OomTest *p, int bEnable){
+  lsm_env *pTestEnv;
+  p->bEnable = bEnable;
+  pTestEnv = p->pEnv;
+  testMallocOomEnable(pTestEnv, bEnable);
+}
+
+/*
+** Advance to the next iteration by incrementing p->iNext, the value
+** that testOomContinue() hands to testMallocOom().
+*/
+static void testOomNext(OomTest *p){
+  p->iNext += 1;
+}
+
+/*
+** Return 1 if the failure counter is greater than zero, or 0 otherwise.
+*/
+static int testOomHit(OomTest *p){
+  if( p->nFail>0 ) return 1;
+  return 0;
+}
+
+/*
+** Return the error code accumulated in the OomTest object.
+*/
+static int testOomFinish(OomTest *p){
+  const int rcFinal = p->rc;
+  return rcFinal;
+}
+
+/*
+** Test-level assertion. If bVal is zero, test_failed() is called and
+** the OomTest error code is set to 1. A non-zero bVal has no effect.
+*/
+static void testOomAssert(OomTest *p, int bVal){
+  if( bVal ) return;
+  test_failed();
+  p->rc = 1;
+}
+
+/*
+** Check that error code rc is consistent with the state of OomTest
+** object p. Two assertions are made:
+**
+**   * rc is either LSM_OK or LSM_NOMEM, and
+**   * unless OOM injection is disabled (p->bEnable==0), rc is LSM_NOMEM
+**     exactly when a failure has been recorded (see testOomHit()).
+*/
+static void testOomAssertRc(OomTest *p, int rc){
+  const int bNomem = (rc==LSM_NOMEM);
+  testOomAssert(p, rc==LSM_OK || bNomem);
+  testOomAssert(p, testOomHit(p)==bNomem || p->bEnable==0 );
+}
+
+/*
+** If *pRc is LSM_OK, create a new handle with lsm_new() and open
+** database zName on it. The resulting error code is checked with
+** testOomAssertRc() and stored in *pRc. If *pRc is not LSM_OK on entry
+** this function does nothing.
+*/
+static void testOomOpen(
+  OomTest *pOom,                  /* OOM test state */
+  const char *zName,              /* Name of database to open */
+  lsm_db **ppDb,                  /* OUT: New database handle */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int rc;
+
+  if( *pRc!=LSM_OK ) return;
+
+  rc = lsm_new(tdb_lsm_env(), ppDb);
+  if( rc==LSM_OK ){
+    rc = lsm_open(*ppDb, zName);
+  }
+  testOomAssertRc(pOom, rc);
+  *pRc = rc;
+}
+
+/*
+** Seek a fresh cursor to key (pKey/nKey) and check that the key and the
+** value found there match (pKey/nKey) and (pVal/nVal). Every error code
+** produced along the way, including the incoming *pRc, is checked with
+** testOomAssertRc(). The final error code is stored in *pRc. Nothing
+** beyond the initial check is done if *pRc is not LSM_OK on entry.
+*/
+static void testOomFetch(
+  OomTest *pOom,                  /* OOM test state */
+  lsm_db *pDb,                    /* Database handle */
+  void *pKey, int nKey,           /* Key to look up */
+  void *pVal, int nVal,           /* Expected value */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  lsm_cursor *pCsr;
+  const void *pGot;
+  int nGot;
+  int rc;
+
+  testOomAssertRc(pOom, *pRc);
+  if( *pRc!=LSM_OK ) return;
+
+  rc = lsm_csr_open(pDb, &pCsr);
+  if( rc==LSM_OK ){
+    rc = lsm_csr_seek(pCsr, pKey, nKey, 0);
+  }
+  testOomAssertRc(pOom, rc);
+
+  /* Compare the key the cursor points at. */
+  if( rc==LSM_OK ){
+    testOomAssert(pOom, lsm_csr_valid(pCsr));
+
+    rc = lsm_csr_key(pCsr, &pGot, &nGot);
+    testOomAssertRc(pOom, rc);
+    testOomAssert(pOom,
+        rc!=LSM_OK || (nGot==nKey && memcmp(pKey, pGot, nKey)==0)
+    );
+  }
+
+  /* Compare the value the cursor points at. */
+  if( rc==LSM_OK ){
+    testOomAssert(pOom, lsm_csr_valid(pCsr));
+
+    rc = lsm_csr_value(pCsr, &pGot, &nGot);
+    testOomAssertRc(pOom, rc);
+    testOomAssert(pOom,
+        rc!=LSM_OK || (nGot==nVal && memcmp(pVal, pGot, nVal)==0)
+    );
+  }
+
+  lsm_csr_close(pCsr);
+  *pRc = rc;
+}
+
+/*
+** Insert the key/value pair into the database with lsm_insert(). Both
+** the incoming *pRc and the result of the insert are checked with
+** testOomAssertRc(). The insert is skipped if *pRc is not LSM_OK on
+** entry; otherwise its result is stored in *pRc.
+*/
+static void testOomWrite(
+  OomTest *pOom,                  /* OOM test state */
+  lsm_db *pDb,                    /* Database handle */
+  void *pKey, int nKey,           /* Key to write */
+  void *pVal, int nVal,           /* Value to write */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int rcInsert;
+
+  testOomAssertRc(pOom, *pRc);
+  if( *pRc!=LSM_OK ) return;
+
+  rcInsert = lsm_insert(pDb, pKey, nKey, pVal, nVal);
+  testOomAssertRc(pOom, rcInsert);
+  *pRc = rcInsert;
+}
+
+
+/*
+** String wrapper around testOomFetch(): the key and the expected value
+** are nul-terminated strings whose lengths are taken with strlen().
+*/
+static void testOomFetchStr(
+  OomTest *pOom,                  /* OOM test state */
+  lsm_db *pDb,                    /* Database handle */
+  const char *zKey,               /* Key to look up */
+  const char *zVal,               /* Expected value */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int nKeyByte;
+  int nValByte;
+
+  nKeyByte = strlen(zKey);
+  nValByte = strlen(zVal);
+  testOomFetch(pOom, pDb, (void *)zKey, nKeyByte, (void *)zVal, nValByte, pRc);
+}
+
+/*
+** Datasource wrapper around testOomFetch(): entry iKey of datasource
+** pData supplies both the key to look up and the expected value.
+*/
+static void testOomFetchData(
+  OomTest *pOom,                  /* OOM test state */
+  lsm_db *pDb,                    /* Database handle */
+  Datasource *pData,              /* Source of the key/value pair */
+  int iKey,                       /* Index of the entry within pData */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  void *pEntryKey;
+  void *pEntryVal;
+  int nEntryKey;
+  int nEntryVal;
+
+  testDatasourceEntry(pData, iKey, &pEntryKey, &nEntryKey, &pEntryVal, &nEntryVal);
+  testOomFetch(pOom, pDb, pEntryKey, nEntryKey, pEntryVal, nEntryVal, pRc);
+}
+
+/*
+** String wrapper around testOomWrite(): the key and the value are
+** nul-terminated strings whose lengths are taken with strlen().
+*/
+static void testOomWriteStr(
+  OomTest *pOom,                  /* OOM test state */
+  lsm_db *pDb,                    /* Database handle */
+  const char *zKey,               /* Key to write */
+  const char *zVal,               /* Value to write */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int nKeyByte;
+  int nValByte;
+
+  nKeyByte = strlen(zKey);
+  nValByte = strlen(zVal);
+  testOomWrite(pOom, pDb, (void *)zKey, nKeyByte, (void *)zVal, nValByte, pRc);
+}
+
+/*
+** Datasource wrapper around testOomWrite(): entry iKey of datasource
+** pData supplies the key/value pair to insert.
+*/
+static void testOomWriteData(
+  OomTest *pOom,                  /* OOM test state */
+  lsm_db *pDb,                    /* Database handle */
+  Datasource *pData,              /* Source of the key/value pair */
+  int iKey,                       /* Index of the entry within pData */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  void *pEntryKey;
+  void *pEntryVal;
+  int nEntryKey;
+  int nEntryVal;
+
+  testDatasourceEntry(pData, iKey, &pEntryKey, &nEntryKey, &pEntryVal, &nEntryVal);
+  testOomWrite(pOom, pDb, pEntryKey, nEntryKey, pEntryVal, nEntryVal, pRc);
+}
+
+static void testOomScan(
+ OomTest *pOom,
+ lsm_db *pDb,
+ int bReverse, /* Non-zero: seek with LSM_SEEK_LE and step with lsm_csr_prev(); zero: LSM_SEEK_GE and lsm_csr_next() */
+ const void *pKey, int nKey, /* Key the cursor is initially positioned with */
+ int nScan,
+ int *pRc
+){
+ if( *pRc==0 ){
+ int rc;
+ int iScan = 0;
+ lsm_cursor *pCsr;
+ int (*xAdvance)(lsm_cursor *) = 0; /* Cursor step function, chosen from bReverse below */
+
+
+ rc = lsm_csr_open(pDb, &pCsr);
+ testOomAssertRc(pOom, rc);
+
+ if( rc==LSM_OK ){
+ if( bReverse ){
+ rc = lsm_csr_seek(pCsr, pKey, nKey, LSM_SEEK_LE);
+ xAdvance = lsm_csr_prev;
+ }else{
+ rc = lsm_csr_seek(pCsr, pKey, nKey, LSM_SEEK_GE);
+ xAdvance = lsm_csr_next;
+ }
+ }
+ testOomAssertRc(pOom, rc);
+
+ while( rc==LSM_OK && lsm_csr_valid(pCsr) && iScan "one" /* NOTE(review): line is truncated - the rest of the loop, the end of this function and the start of the following comment are missing */
+** "two" -> "four"
+** "three" -> "nine"
+** "four" -> "sixteen"
+** "five" -> "twentyfive"
+** "six" -> "thirtysix"
+** "seven" -> "fourtynine"
+** "eight" -> "sixtyfour"
+*/
+static void setup_populate_db(void){
+ const char *azStr[] = { /* Flat list of alternating key and value strings */
+ "one", "one",
+ "two", "four",
+ "three", "nine",
+ "four", "sixteen",
+ "five", "twentyfive",
+ "six", "thirtysix",
+ "seven", "fourtynine",
+ "eight", "sixtyfour",
+ };
+ int rc;
+ int ii;
+ lsm_db *pDb;
+
+ testDeleteLsmdb(LSMTEST6_TESTDB); /* Start from a clean slate */
+
+ rc = lsm_new(tdb_lsm_env(), &pDb);
+ if( rc==LSM_OK ) rc = lsm_open(pDb, LSMTEST6_TESTDB);
+
+ for(ii=0; rc==LSM_OK && iiiInsStart, pStep->nIns, pRc); /* NOTE(review): line is truncated - the rest of this function is missing; from here on the text appears to be the tail of a different function that uses pStep, pData and pRc */
+ testDeleteDatasourceRange(pDb, pData, pStep->iDelStart, pStep->nDel, pRc);
+ if( *pRc==0 ){
+ int nSave = -1; /* presumably -1 makes lsm_config() report the current setting - TODO confirm */
+ int nBuf = 64;
+ lsm_db *db = tdb_lsm(pDb);
+
+ lsm_config(db, LSM_CONFIG_AUTOFLUSH, &nSave);
+ lsm_config(db, LSM_CONFIG_AUTOFLUSH, &nBuf);
+ lsm_begin(db, 1);
+ lsm_commit(db, 0);
+ lsm_config(db, LSM_CONFIG_AUTOFLUSH, &nSave); /* Hand the first reading back to lsm_config() */
+
+ *pRc = lsm_work(db, 0, 0, 0);
+ if( *pRc==0 ){
+ *pRc = lsm_checkpoint(db, 0);
+ }
+ }
+}
+
+static void doSetupStepArray(
+ TestDb *pDb,
+ Datasource *pData,
+ const SetupStep *aStep, /* Array of steps */
+ int nStep /* presumably the number of entries in aStep[] - TODO confirm */
+){
+ int i;
+ for(i=0; i /* NOTE(review): line is truncated - the loop and the rest of this function are missing */
+/*
+** Read nByte bytes from offset iOff of file zFile into buffer pOut.
+**
+** This is a no-op if *pRc is non-zero on entry. *pRc is set to 1 if the
+** file cannot be opened, if the seek fails, or if fewer than nByte bytes
+** could be read. On success *pRc is left unchanged.
+*/
+void testReadFile(const char *zFile, int iOff, void *pOut, int nByte, int *pRc){
+  FILE *pFile;
+
+  if( *pRc!=0 ) return;
+
+  pFile = fopen(zFile, "rb");
+  if( pFile==0 ){
+    *pRc = 1;
+    return;
+  }
+
+  if( fseek(pFile, iOff, SEEK_SET)!=0 ){
+    *pRc = 1;
+  }else{
+    assert( nByte>=0 );
+    if( fread(pOut, 1, nByte, pFile)!=(size_t)nByte ) *pRc = 1;
+  }
+  fclose(pFile);
+}
+
+/*
+** Write nByte bytes from buffer pOut to offset iOff of file zFile. The
+** file is opened in "r+b" mode, so it must already exist.
+**
+** This is a no-op if *pRc is non-zero on entry. *pRc is set to 1 if the
+** file cannot be opened, if the seek fails, if fewer than nByte bytes
+** are accepted by fwrite(), or if fclose() reports an error. On success
+** *pRc is left unchanged.
+*/
+void testWriteFile(
+  const char *zFile,              /* File to write to */
+  int iOff,                       /* Byte offset to write at */
+  void *pOut,                     /* Data to write */
+  int nByte,                      /* Number of bytes to write */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  if( *pRc==0 ){
+    FILE *fd;
+    fd = fopen(zFile, "r+b");
+    if( fd==0 ){
+      *pRc = 1;
+    }else{
+      if( 0!=fseek(fd, iOff, SEEK_SET) ){
+        *pRc = 1;
+      }else{
+        assert( nByte>=0 );
+        if( (size_t)nByte!=fwrite(pOut, 1, nByte, fd) ){
+          *pRc = 1;
+        }
+      }
+      /* fwrite() may only buffer the data. A failure to flush it to the
+      ** file is reported by fclose(), so that result is checked too. */
+      if( 0!=fclose(fd) ){
+        *pRc = 1;
+      }
+    }
+  }
+}
+
+/*
+** Read the first sizeof(ShmHeader) bytes of file "<zDb>-shm" into a
+** newly allocated ShmHeader and return it. The caller is responsible
+** for releasing the returned object with testFree().
+**
+** A failure to read the file trips an assert().
+*/
+static ShmHeader *getShmHeader(const char *zDb){
+  int rc = 0;
+  char *zShm = testMallocPrintf("%s-shm", zDb);
+  ShmHeader *pHdr;
+
+  pHdr = testMalloc(sizeof(ShmHeader));
+  testReadFile(zShm, 0, (void *)pHdr, sizeof(ShmHeader), &rc);
+  assert( rc==0 );
+
+  /* The file name is only needed for the read above. */
+  testFree(zShm);
+
+  return pHdr;
+}
+
+/*
+** Copy LSM database zDb (testCopyLsmdb() is used to create
+** "testcopy.lsm") and open the copy with the "test_no_recovery=1"
+** configuration string. Then, for as long as no error has occurred:
+**
+**   1) Check that the checksum of the copy is zCksum.
+**   2) Write datasource entries 1..10 to the copy, then delete them
+**      again.
+**   3) Check that the two tree-headers (hdr1 and hdr2) in the copy's
+**      shared-memory file are identical.
+**   4) Set the autoflush value to 64, run an empty write transaction
+**      and call lsm_work().
+**   5) Check once more that the checksum is still zCksum.
+**
+** Afterwards the copy is closed and deleted. The resulting error code
+** is stored in *pRc. This function is a no-op if *pRc is not LSM_OK on
+** entry.
+*/
+static void doLiveRecovery(const char *zDb, const char *zCksum, int *pRc){
+  const DatasourceDefn defn = {TEST_DATASOURCE_RANDOM, 20, 25, 100, 500};
+  const char *zCopy = "testcopy.lsm";
+  char zActual[TEST_CKSUM_BYTES];
+  Datasource *pData;
+  TestDb *pDb = 0;
+  int rc;
+
+  if( *pRc!=LSM_OK ) return;
+
+  pData = testDatasourceNew(&defn);
+  testCopyLsmdb(zDb, zCopy);
+  rc = tdb_lsm_open("test_no_recovery=1", zCopy, 0, &pDb);
+
+  if( rc==0 ){
+    ShmHeader *pShm;
+
+    testCksumDatabase(pDb, zActual);
+    testCompareStr(zCksum, zActual, &rc);
+
+    testWriteDatasourceRange(pDb, pData, 1, 10, &rc);
+    testDeleteDatasourceRange(pDb, pData, 1, 10, &rc);
+
+    /* Test that the two tree-headers are now consistent. */
+    pShm = getShmHeader(zCopy);
+    if( rc==0 ){
+      if( memcmp(&pShm->hdr1, &pShm->hdr2, sizeof(pShm->hdr1))!=0 ){
+        rc = 1;
+      }
+    }
+    testFree(pShm);
+
+    if( rc==0 ){
+      int nAutoflush = 64;
+      lsm_db *pLsm = tdb_lsm(pDb);
+
+      lsm_config(pLsm, LSM_CONFIG_AUTOFLUSH, &nAutoflush);
+      lsm_begin(pLsm, 1);
+      lsm_commit(pLsm, 0);
+      rc = lsm_work(pLsm, 0, 0, 0);
+    }
+
+    testCksumDatabase(pDb, zActual);
+    testCompareStr(zCksum, zActual, &rc);
+  }
+
+  testDatasourceFree(pData);
+  testClose(&pDb);
+  testDeleteLsmdb(zCopy);
+  *pRc = rc;
+}
+
+static void doWriterCrash1(int *pRc){
+ const int nWrite = 2000;
+ const int nStep = 10;
+ const int iWriteStart = 20000;
+ int rc = 0;
+ TestDb *pDb = 0;
+ Datasource *pData = 0;
+
+ rc = tdb_lsm_open("autowork=0", "testdb.lsm", 1, &pDb);
+ if( rc==0 ){
+ int iDot = 0;
+ char zCksum[TEST_CKSUM_BYTES];
+ int i;
+ setupDatabase1(pDb, &pData);
+ testCksumDatabase(pDb, zCksum);
+ testBegin(pDb, 2, &rc);
+ for(i=0; rc==0 && ihdr1, &pHdr1->hdr1, sizeof(pHdr1->hdr1)); /* NOTE(review): line is truncated - the rest of this function is missing; the text from here on uses pHdr1, pHdr2, zCksum1 and zCksum2, which are not declared above, so it appears to be the tail of a different function */
+ pHdr2->bWriter = 1;
+ testWriteFile("testdb.lsm-shm", 0, (void *)pHdr2, sizeof(ShmHeader), &rc);
+ doLiveRecovery("testdb.lsm", zCksum1, &rc);
+
+ /* If both tree-headers are valid, tree-header-1 is used. */
+ memcpy(&pHdr2->hdr1, &pHdr2->hdr2, sizeof(pHdr1->hdr1));
+ memcpy(&pHdr2->hdr2, &pHdr1->hdr1, sizeof(pHdr1->hdr1));
+ pHdr2->bWriter = 1;
+ testWriteFile("testdb.lsm-shm", 0, (void *)pHdr2, sizeof(ShmHeader), &rc);
+ doLiveRecovery("testdb.lsm", zCksum2, &rc);
+
+ /* If tree-header 1 is invalid, tree-header-2 is used */
+ memcpy(&pHdr2->hdr2, &pHdr2->hdr1, sizeof(pHdr1->hdr1));
+ pHdr2->hdr1.aCksum[0] = 5;
+ pHdr2->hdr1.aCksum[0] = 6; /* NOTE(review): aCksum[0] is stored twice (5, then 6); the second store presumably targets aCksum[1] - confirm */
+ pHdr2->bWriter = 1;
+ testWriteFile("testdb.lsm-shm", 0, (void *)pHdr2, sizeof(ShmHeader), &rc);
+ doLiveRecovery("testdb.lsm", zCksum2, &rc);
+
+ /* If tree-header 2 is invalid, tree-header-1 is used */
+ memcpy(&pHdr2->hdr1, &pHdr2->hdr2, sizeof(pHdr1->hdr1));
+ pHdr2->hdr2.aCksum[0] = 5;
+ pHdr2->hdr2.aCksum[0] = 6; /* NOTE(review): same double store to aCksum[0] as above - confirm */
+ pHdr2->bWriter = 1;
+ testWriteFile("testdb.lsm-shm", 0, (void *)pHdr2, sizeof(ShmHeader), &rc);
+ doLiveRecovery("testdb.lsm", zCksum2, &rc);
+
+ testFree(pHdr1);
+ testFree(pHdr2);
+ testClose(&pDb);
+ }
+
+ *pRc = rc;
+}
+
+void do_writer_crash_test(const char *zPattern, int *pRc){
+ struct Test {
+ const char *zName; /* Test case name */
+ void (*xFunc)(int *); /* Test implementation; receives pRc */
+ } aTest[] = {
+ { "writercrash1.lsm", doWriterCrash1 },
+ { "writercrash2.lsm", doWriterCrash2 },
+ };
+ int i;
+ for(i=0; izName) ){ /* NOTE(review): line is truncated - the loop condition and the start of the if() that selects a test are missing */
+ p->xFunc(pRc);
+ testCaseFinish(*pRc);
+ }
+ }
+
+}
+
+
diff --git a/ext/lsm1/lsm-test/lsmtest9.c b/ext/lsm1/lsm-test/lsmtest9.c
new file mode 100644
index 0000000..144cae7
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest9.c
@@ -0,0 +1,143 @@
+
+#include "lsmtest.h"
+
+#define DATA_SEQUENTIAL TEST_DATASOURCE_SEQUENCE
+#define DATA_RANDOM TEST_DATASOURCE_RANDOM
+
+typedef struct Datatest4 Datatest4;
+
+/*
+** Test overview:
+**
+** 1. Insert (Datatest4.nRec) records into a database.
+**
+** 2. Repeat (Datatest4.nRepeat) times:
+**
+** 2a. Delete 2/3 of the records in the database.
+**
+** 2b. Run lsm_work(nMerge=1).
+**
+** 2c. Insert as many records as were deleted in 2a.
+**
+** 2d. Check database content is as expected.
+**
+** 2e. If (Datatest4.bReopen) is true, close and reopen the database.
+*/
+struct Datatest4 {
+ /* Datasource definition */
+ DatasourceDefn defn;
+
+ int nRec; /* Number of records for step 1; doDataTest4() works in thirds of this value */
+ int nRepeat; /* Number of iterations of step 2 */
+ int bReopen; /* True to close and reopen the database (step 2e) */
+};
+
+/*
+** Run a single data4 test case, as described in the overview above
+** struct Datatest4. All operations are applied both to the database
+** under test (pDb) and to a control database (pControl) so that their
+** contents can be compared with testCompareDb().
+**
+** On all iterations except the last, the deleted records are replaced
+** by new ones, the two databases are compared and, if p->bReopen is
+** set, the database under test is closed and reopened.
+**
+** The test case is finished with testCaseFinish() and the final error
+** code is stored in *pRc.
+*/
+static void doDataTest4(
+  const char *zSystem,            /* Database system to test */
+  Datatest4 *p,                   /* Structure containing test parameters */
+  int *pRc                        /* OUT: Error code */
+){
+  lsm_db *db = 0;
+  TestDb *pDb;
+  TestDb *pControl;
+  Datasource *pData;
+  int i;
+  int rc = 0;
+  int iDot = 0;
+  int bMultiThreaded = 0;         /* True for MT LSM database */
+
+  int nRecOn3 = (p->nRec / 3);
+  int iData = 0;
+
+  /* Start the test case, open a database and allocate the datasource. */
+  rc = testControlDb(&pControl);
+  pDb = testOpen(zSystem, 1, &rc);
+  pData = testDatasourceNew(&p->defn);
+  if( rc==0 ){
+    db = tdb_lsm(pDb);
+    bMultiThreaded = tdb_lsm_multithread(pDb);
+  }
+
+  testWriteDatasourceRange(pControl, pData, iData, nRecOn3*3, &rc);
+  testWriteDatasourceRange(pDb,      pData, iData, nRecOn3*3, &rc);
+
+  for(i=0; rc==0 && i<p->nRepeat; i++){
+
+    testDeleteDatasourceRange(pControl, pData, iData, nRecOn3*2, &rc);
+    testDeleteDatasourceRange(pDb,      pData, iData, nRecOn3*2, &rc);
+
+    /* db is only set when tdb_lsm() returned a handle for pDb. */
+    if( db ){
+      int nDone;
+#if 0
+      fprintf(stderr, "lsm_work() start...\n"); fflush(stderr);
+#endif
+      do {
+        nDone = 0;
+        rc = lsm_work(db, 1, (1<<30), &nDone);
+      }while( rc==0 && nDone>0 );
+      if( bMultiThreaded && rc==LSM_BUSY ) rc = LSM_OK;
+#if 0
+      fprintf(stderr, "lsm_work() done...\n"); fflush(stderr);
+#endif
+    }
+
+    /* Skip the re-insert, compare and reopen on the final iteration. */
+    if( i+1<p->nRepeat ){
+      iData += (nRecOn3*2);
+      testWriteDatasourceRange(pControl, pData, iData+nRecOn3, nRecOn3*2, &rc);
+      testWriteDatasourceRange(pDb,      pData, iData+nRecOn3, nRecOn3*2, &rc);
+
+      testCompareDb(pData, nRecOn3*3, iData, pControl, pDb, &rc);
+
+      /* If Datatest4.bReopen is true, close and reopen the database */
+      if( p->bReopen ){
+        testReopen(&pDb, &rc);
+        if( rc==0 ) db = tdb_lsm(pDb);
+      }
+    }
+
+    /* Update the progress dots... */
+    testCaseProgress(i, p->nRepeat, testCaseNDot(), &iDot);
+  }
+
+  testClose(&pDb);
+  testClose(&pControl);
+  testDatasourceFree(pData);
+  testCaseFinish(rc);
+  *pRc = rc;
+}
+
+/*
+** Build the name of a data4 test case from the system name, the
+** datasource name and the nRec, nRepeat and bReopen parameters. The
+** string is obtained from testMallocPrintf() and returned to the
+** caller; the intermediate datasource name is freed here.
+*/
+static char *getName4(const char *zSystem, Datatest4 *pTest){
+  char *zDefn = testDatasourceName(&pTest->defn);
+  char *zName = testMallocPrintf("data4.%s.%s.%d.%d.%d",
+      zSystem, zDefn, pTest->nRec, pTest->nRepeat, pTest->bReopen
+  );
+
+  testFree(zDefn);
+  return zName;
+}
+
+void test_data_4(
+ const char *zSystem, /* Database system name */
+ const char *zPattern, /* Run test cases that match this pattern */
+ int *pRc /* IN/OUT: Error code */
+){
+ Datatest4 aTest[] = {
+ /* defn, nRec, nRepeat, bReopen */
+ { {DATA_RANDOM, 20,25, 500,600}, 10000, 10, 0 },
+ { {DATA_RANDOM, 20,25, 500,600}, 10000, 10, 1 },
+ };
+
+ int i;
+
+ for(i=0; *pRc==LSM_OK && ieType ){ /* NOTE(review): line is truncated - the rest of this function is missing; the text from here on appears to be the tail of a function that builds datasource keys and values */
+ case TEST_DATASOURCE_RANDOM: {
+ int nRange = (1 + p->nMaxKey - p->nMinKey);
+ nKey = (int)( testPrngValue((u32)iData) % nRange ) + p->nMinKey;
+ testPrngString((u32)iData, p->aKey, nKey);
+ break;
+ }
+ case TEST_DATASOURCE_SEQUENCE:
+ nKey = sprintf(p->aKey, "%012d", iData);
+ break;
+ }
+ *ppKey = p->aKey;
+ *pnKey = nKey;
+ }
+ if( ppVal ){
+ u32 nVal = testPrngValue((u32)iData)%(1+p->nMaxVal-p->nMinVal)+p->nMinVal;
+ testPrngString((u32)~iData, p->aVal, (int)nVal);
+ *ppVal = p->aVal;
+ *pnVal = (int)nVal;
+ }
+}
+
+/*
+** Release a datasource object. testDatasourceNew() obtains the object
+** and its key/value buffers from a single testMalloc() call, so one
+** testFree() releases everything.
+*/
+void testDatasourceFree(Datasource *p){
+  Datasource *pDel = p;
+  testFree(pDel);
+}
+
+/*
+** Format the datasource-definition passed as the first argument as a
+** nul-terminated string of the form "<type>.(<minkey>-<maxkey>).
+** (<minval>-<maxval>)" (without the line break), where <type> is "seq"
+** for TEST_DATASOURCE_SEQUENCE and "rnd" for anything else.
+**
+** The caller should eventually free the returned pointer using
+** testFree().
+*/
+char *testDatasourceName(const DatasourceDefn *p){
+  const char *zType = "rnd";
+
+  if( p->eType==TEST_DATASOURCE_SEQUENCE ){
+    zType = "seq";
+  }
+  return testMallocPrintf("%s.(%d-%d).(%d-%d)",
+      zType, p->nMinKey, p->nMaxKey, p->nMinVal, p->nMaxVal
+  );
+}
+
+/*
+** Allocate and return a new datasource object described by pDefn. The
+** object should eventually be released with testDatasourceFree().
+**
+** For TEST_DATASOURCE_SEQUENCE both key size limits are fixed at 128.
+** Otherwise the key limits are taken from pDefn, with negative minimums
+** clamped to 0 and each maximum raised to at least its minimum. The
+** value limits are always clamped the same way.
+**
+** The key buffer (nMaxKey bytes) and the value buffer (nMaxVal bytes,
+** plus one spare byte) are carved out of the same allocation,
+** immediately after the Datasource structure.
+*/
+Datasource *testDatasourceNew(const DatasourceDefn *pDefn){
+  Datasource *p;
+  int nMinKey;
+  int nMaxKey;
+  int nMinVal;
+  int nMaxVal;
+
+  if( pDefn->eType==TEST_DATASOURCE_SEQUENCE ){
+    nMinKey = 128;
+    nMaxKey = 128;
+  }else{
+    nMinKey = MAX(0, pDefn->nMinKey);
+    nMaxKey = MAX(nMinKey, pDefn->nMaxKey);
+  }
+  nMinVal = MAX(0, pDefn->nMinVal);
+  nMaxVal = MAX(nMinVal, pDefn->nMaxVal);
+
+  p = (Datasource *)testMalloc(sizeof(Datasource) + nMaxKey + nMaxVal + 1);
+  p->eType = pDefn->eType;
+  p->nMinKey = nMinKey;
+  p->nMinVal = nMinVal;
+  p->nMaxKey = nMaxKey;
+  p->nMaxVal = nMaxVal;
+
+  p->aKey = (char *)&p[1];
+  p->aVal = &p->aKey[nMaxKey];
+  return p;
+}
diff --git a/ext/lsm1/lsm-test/lsmtest_func.c b/ext/lsm1/lsm-test/lsmtest_func.c
new file mode 100644
index 0000000..eb8346a
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_func.c
@@ -0,0 +1,177 @@
+
+#include "lsmtest.h"
+
+
+/*
+** lsmtest work ?-nmerge N? ?-nkb N? DATABASE
+**
+** Open DATABASE, set its auto-checkpoint value to twice its block size,
+** call lsm_work() with the requested nMerge (default 1) and nKB
+** (default 1<<30) values and, if all of that succeeded, run a
+** checkpoint.
+**
+** Returns the LSM (or testArgSelect()) error code, or -1 after printing
+** a usage message if the arguments are malformed.
+*/
+int do_work(int nArg, char **azArg){
+  struct Option {
+    const char *zName;
+  } aOpt [] = {
+    { "-nmerge" },
+    { "-nkb" },
+    { 0 }
+  };
+
+  lsm_db *pDb = 0;                /* Stays 0 if lsm_new() gives no handle */
+  int rc;
+  int i;
+  const char *zDb;
+  int nMerge = 1;
+  int nKB = (1<<30);
+
+  if( nArg==0 ) goto usage;
+  zDb = azArg[nArg-1];
+  for(i=0; i<(nArg-1); i++){
+    int iSel;
+    rc = testArgSelect(aOpt, "option", azArg[i], &iSel);
+    if( rc ) return rc;
+    switch( iSel ){
+      case 0:
+        /* The option value must not be the trailing DATABASE argument */
+        i++;
+        if( i==(nArg-1) ) goto usage;
+        nMerge = atoi(azArg[i]);
+        break;
+      case 1:
+        i++;
+        if( i==(nArg-1) ) goto usage;
+        nKB = atoi(azArg[i]);
+        break;
+    }
+  }
+
+  rc = lsm_new(0, &pDb);
+  if( rc!=LSM_OK ){
+    testPrintError("lsm_new(): rc=%d\n", rc);
+  }else{
+    rc = lsm_open(pDb, zDb);
+    if( rc!=LSM_OK ){
+      testPrintError("lsm_open(): rc=%d\n", rc);
+    }else{
+      /* Read the block size, then use twice that value as the
+      ** auto-checkpoint setting. */
+      int n = -1;
+      lsm_config(pDb, LSM_CONFIG_BLOCK_SIZE, &n);
+      n = n*2;
+      lsm_config(pDb, LSM_CONFIG_AUTOCHECKPOINT, &n);
+
+      rc = lsm_work(pDb, nMerge, nKB, 0);
+      if( rc!=LSM_OK ){
+        testPrintError("lsm_work(): rc=%d\n", rc);
+      }
+    }
+  }
+  if( rc==LSM_OK ){
+    rc = lsm_checkpoint(pDb, 0);
+  }
+
+  if( pDb ) lsm_close(pDb);
+  return rc;
+
+ usage:
+  testPrintUsage("?-nmerge N? ?-nkb N? DATABASE");
+  return -1;
+}
+
+
+/*
+** lsmtest show ?-config LSM-CONFIG? DATABASE ?COMMAND ?PGNO??
+**
+** Print information about DATABASE to stdout. With no COMMAND the
+** database structure (LSM_INFO_DB_STRUCTURE) is shown. COMMAND selects
+** one of the entries of aOpt[] below: "freelist", "blocksize" and
+** "pagesize" take no further argument, all other commands require a
+** PGNO argument.
+**
+** If the first argument is a prefix (at least two characters long) of
+** "-config", the argument that follows it is passed to
+** tdb_lsm_configure() before the database is opened.
+**
+** Returns the LSM (or testArgSelect()) error code, or -1 after printing
+** a usage message if the arguments are malformed.
+*/
+int do_show(int nArg, char **azArg){
+  lsm_db *pDb = 0;                /* Stays 0 if lsm_new() gives no handle */
+  int rc;
+  const char *zDb;
+
+  int eOpt = LSM_INFO_DB_STRUCTURE;
+  unsigned int iPg = 0;
+  int bConfig = 0;
+  const char *zConfig = "";
+
+  struct Option {
+    const char *zName;            /* Command name */
+    int bConfig;                  /* True for lsm_config(), else lsm_info() */
+    int eOpt;                     /* LSM_CONFIG_XXX or LSM_INFO_XXX value */
+  } aOpt [] = {
+    { "array",       0, LSM_INFO_ARRAY_STRUCTURE },
+    { "array-pages", 0, LSM_INFO_ARRAY_PAGES },
+    { "blocksize",   1, LSM_CONFIG_BLOCK_SIZE },
+    { "pagesize",    1, LSM_CONFIG_PAGE_SIZE },
+    { "freelist",    0, LSM_INFO_FREELIST },
+    { "page-ascii",  0, LSM_INFO_PAGE_ASCII_DUMP },
+    { "page-hex",    0, LSM_INFO_PAGE_HEX_DUMP },
+    { 0, 0, 0 }
+  };
+
+  char *z = 0;
+  int iDb = 0;                    /* Index of DATABASE in azArg[] */
+
+  /* Check if there is a "-config" option. strncmp() is used rather than
+  ** memcmp() so that an argument longer than "-config" never causes a
+  ** read past the end of the string literal. */
+  if( nArg>2 && strlen(azArg[0])>1
+   && strncmp(azArg[0], "-config", strlen(azArg[0]))==0
+  ){
+    zConfig = azArg[1];
+    iDb = 2;
+  }
+  if( nArg<(iDb+1) ) goto usage;
+
+  if( nArg>(iDb+1) ){
+    /* eOpt first receives the index into aOpt[], then the option code */
+    rc = testArgSelect(aOpt, "option", azArg[iDb+1], &eOpt);
+    if( rc!=0 ) return rc;
+    bConfig = aOpt[eOpt].bConfig;
+    eOpt = aOpt[eOpt].eOpt;
+    if( (bConfig==0 && eOpt==LSM_INFO_FREELIST)
+     || (bConfig==1 && eOpt==LSM_CONFIG_BLOCK_SIZE)
+     || (bConfig==1 && eOpt==LSM_CONFIG_PAGE_SIZE)
+    ){
+      if( nArg!=(iDb+2) ) goto usage;
+    }else{
+      if( nArg!=(iDb+3) ) goto usage;
+      iPg = atoi(azArg[iDb+2]);
+    }
+  }
+  zDb = azArg[iDb];
+
+  rc = lsm_new(0, &pDb);
+  if( rc!=LSM_OK ){
+    testPrintError("lsm_new(): rc=%d\n", rc);
+  }else{
+    /* Only configure the handle once it is known to exist. */
+    tdb_lsm_configure(pDb, zConfig);
+    rc = lsm_open(pDb, zDb);
+    if( rc!=LSM_OK ){
+      testPrintError("lsm_open(): rc=%d\n", rc);
+    }
+  }
+
+  if( rc==LSM_OK ){
+    if( bConfig==0 ){
+      switch( eOpt ){
+        case LSM_INFO_DB_STRUCTURE:
+        case LSM_INFO_FREELIST:
+          rc = lsm_info(pDb, eOpt, &z);
+          break;
+        case LSM_INFO_ARRAY_STRUCTURE:
+        case LSM_INFO_ARRAY_PAGES:
+        case LSM_INFO_PAGE_ASCII_DUMP:
+        case LSM_INFO_PAGE_HEX_DUMP:
+          rc = lsm_info(pDb, eOpt, iPg, &z);
+          break;
+        default:
+          assert( !"no chance" );
+      }
+
+      if( rc==LSM_OK ){
+        printf("%s\n", z ? z : "");
+        fflush(stdout);
+      }
+      lsm_free(lsm_get_env(pDb), z);
+    }else{
+      int iRes = -1;
+      lsm_config(pDb, eOpt, &iRes);
+      printf("%d\n", iRes);
+      fflush(stdout);
+    }
+  }
+
+  if( pDb ) lsm_close(pDb);
+  return rc;
+
+ usage:
+  testPrintUsage("DATABASE ?array|page-ascii|page-hex PGNO?");
+  return -1;
+}
diff --git a/ext/lsm1/lsm-test/lsmtest_io.c b/ext/lsm1/lsm-test/lsmtest_io.c
new file mode 100644
index 0000000..7aa5d10
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_io.c
@@ -0,0 +1,248 @@
+
+/*
+** SUMMARY
+**
+** This file implements the 'io' subcommand of the test program. It is used
+** for testing the performance of various combinations of write() and fsync()
+** system calls. All operations occur on a single file, which may or may not
+** exist when a test is started.
+**
+** A test consists of a series of commands. Each command is either a write
+** or an fsync. A write is specified as "<amount>@<offset>", where <amount>
+** is the amount of data written, and <offset> is the offset of the file
+** to write to. An <amount> or an <offset> is specified as an integer number
+** of bytes. Or, if postfixed with a "K", "M" or "G", an integer number of
+** KB, MB or GB, respectively. An fsync is simply "S". All commands are
+** case-insensitive.
+**
+** Example test program:
+**
+** 2M@6M 1492K@4M S 4096@4K S
+**
+** This program writes 2 MB of data starting at the 6MB offset of
+** the file, followed by 1492 KB of data written at the 4MB offset of the
+** file, followed by a call to fsync(), a write of 4KB of data at byte
+** offset 4096, and finally another call to fsync().
+**
+** Commands may either be specified on the command line (one command per
+** command line argument) or read from stdin. Commands read from stdin
+** must be separated by white-space.
+**
+** COMMAND LINE INVOCATION
+**
+** The sub-command implemented in this file must be invoked with at least
+** two arguments - the path to the file to write to and the page-size to
+** use for writing. If there are more than two arguments, then each
+** subsequent argument is assumed to be a test command. If there are exactly
+** two arguments, the test commands are read from stdin.
+**
+** A write command does not result in a single call to system call write().
+** Instead, the specified region is written sequentially using one or
+** more calls to write(), each of which writes not more than one page of
+** data. For example, if the page-size is 4KB, the command "2M@6M" results
+** in 512 calls to write(), each of which writes 4KB of data.
+**
+** EXAMPLES
+**
+** Two equivalent examples:
+**
+** $ lsmtest io testfile.db 4KB 2M@6M 1492K@4M S 4096@4K S
+** 3544K written in 129 ms
+** $ echo "2M@6M 1492K@4M S 4096@4K S" | lsmtest io testfile.db 4096
+** 3544K written in 127 ms
+**
+*/
+
+#include "lsmtest.h"
+
+typedef struct IoContext IoContext;
+
+/* State shared by all commands executed during one 'io' test run. */
+struct IoContext {
+ /* File descriptor the "S" command passes to fdatasync() */
+ int fd;
+ /* Running total: each write command adds (bytes requested / 1024) */
+ int nWrite;
+};
+
+/*
+** Wrapper around isspace(3) for plain "char" arguments. Any byte with the
+** high bit set is reported as not-a-space without calling isspace() at
+** all; every other byte is classified by isspace().
+*/
+static int safe_isspace(char c){
+  return (c & 0x80) ? 0 : isspace(c);
+}
+
+/*
+** Wrapper around isdigit(3) for plain "char" arguments. Any byte with the
+** high bit set is reported as not-a-digit without calling isdigit() at
+** all; every other byte is classified by isdigit().
+*/
+static int safe_isdigit(char c){
+  return (c & 0x80) ? 0 : isdigit(c);
+}
+
+/*
+** Parse a size of the form DIGITS[k|K|m|M|g|G] starting at zIn and return
+** its value in bytes. A "K", "M" or "G" suffix (either case) multiplies the
+** number by 1024, 1024*1024 or 1024*1024*1024 respectively.
+**
+** If *pRc is non-zero on entry this is a no-op that returns 0. If zIn does
+** not begin with a digit, *pRc is set to 1 and 0 is returned. In both of
+** those cases *pzOut is left untouched. Otherwise, if pzOut is not NULL,
+** *pzOut is set to point at the first character following the number and
+** its optional suffix.
+*/
+static i64 getNextSize(char *zIn, char **pzOut, int *pRc){
+  char *zCsr = zIn;               /* Current parse position */
+  i64 nVal = 0;                   /* Value accumulated so far */
+
+  if( *pRc!=0 ) return 0;
+  if( !safe_isdigit(*zCsr) ){
+    *pRc = 1;
+    return 0;
+  }
+
+  /* Accumulate the decimal digits. At least one is present. */
+  do{
+    nVal = nVal*10 + (*zCsr - '0');
+    zCsr++;
+  }while( safe_isdigit(*zCsr) );
+
+  /* Apply the optional unit suffix. */
+  if( *zCsr=='k' || *zCsr=='K' ){
+    nVal = nVal * 1024;
+    zCsr++;
+  }else if( *zCsr=='m' || *zCsr=='M' ){
+    nVal = nVal * 1024 * 1024;
+    zCsr++;
+  }else if( *zCsr=='g' || *zCsr=='G' ){
+    nVal = nVal * 1024 * 1024 * 1024;
+    zCsr++;
+  }
+
+  if( pzOut ) *pzOut = zCsr;
+  return nVal;
+}
+
+static int doOneCmd(
+ IoContext *pCtx,
+ u8 *aData,
+ int pgsz,
+ char *zCmd,
+ char **pzOut
+){
+ char c;
+ char *z = zCmd;
+
+ while( safe_isspace(*z) ) z++;
+ c = *z;
+
+ if( c==0 ){
+ if( pzOut ) *pzOut = z;
+ return 0;
+ }
+
+ if( c=='s' || c=='S' ){
+ if( pzOut ) *pzOut = &z[1];
+ return fdatasync(pCtx->fd);
+ }
+
+ if( safe_isdigit(c) ){
+ i64 iOff = 0;
+ int nByte = 0;
+ int rc = 0;
+ int nPg;
+ int iPg;
+
+ nByte = (int)getNextSize(z, &z, &rc);
+ if( rc || *z!='@' ) goto bad_command;
+ z++;
+ iOff = getNextSize(z, &z, &rc);
+ if( rc || (safe_isspace(*z)==0 && *z!='\0') ) goto bad_command;
+ if( pzOut ) *pzOut = z;
+
+ nPg = (nByte+pgsz-1) / pgsz;
+ lseek(pCtx->fd, (off_t)iOff, SEEK_SET);
+ for(iPg=0; iPgfd, aData, pgsz);
+ }
+ pCtx->nWrite += nByte/1024;
+
+ return 0;
+ }
+
+ bad_command:
+ testPrintError("unrecognized command: %s", zCmd);
+ return 1;
+}
+
+/*
+** Read everything available on stdin into a heap buffer and store a pointer
+** to it in *pzOut. The buffer is obtained from malloc()/realloc(); the
+** caller owns it.
+**
+** Whenever a buffer is returned it is nul-terminated, including when stdin
+** is empty or already at EOF (the result is then an empty string). This
+** matters because the caller starts parsing at *pzOut unconditionally.
+**
+** Returns 0 on success. If memory cannot be obtained, 1 is returned and
+** *pzOut is set to whatever was read before the failure (still
+** nul-terminated), or to NULL if even the initial allocation failed.
+*/
+static int readStdin(char **pzOut){
+  int nAlloc = 256;               /* Current size of zOut in bytes */
+  int nOut = 0;                   /* Bytes of input stored in zOut */
+  int rc = 0;
+  char *zOut;
+
+  zOut = (char *)malloc(nAlloc);
+  if( zOut==0 ){
+    *pzOut = 0;
+    return 1;
+  }
+  zOut[0] = '\0';
+
+  while( !feof(stdin) ){
+    int nRead;
+
+    /* Grow the buffer once there is no room left for data plus the
+    ** terminator. Keep the old (terminated) buffer if realloc() fails. */
+    if( nOut+1>=nAlloc ){
+      char *zNew = (char *)realloc(zOut, nAlloc*2);
+      if( zNew==0 ){
+        rc = 1;
+        break;
+      }
+      zOut = zNew;
+      nAlloc = nAlloc*2;
+    }
+
+    nRead = (int)fread(&zOut[nOut], 1, nAlloc-nOut-1, stdin);
+    if( nRead==0 ) break;
+    nOut += nRead;
+    zOut[nOut] = '\0';
+  }
+
+  *pzOut = zOut;
+  return rc;
+}
+
+int do_io(int nArg, char **azArg){
+ IoContext ctx;
+ int pgsz;
+ char *zFile;
+ char *zPgsz;
+ int i;
+ int rc = 0;
+
+ char *zStdin = 0;
+ char *z;
+
+ u8 *aData;
+
+ memset(&ctx, 0, sizeof(IoContext));
+ if( nArg<2 ){
+ testPrintUsage("FILE PGSZ ?CMD-1 ...?");
+ return -1;
+ }
+ zFile = azArg[0];
+ zPgsz = azArg[1];
+
+ pgsz = (int)getNextSize(zPgsz, 0, &rc);
+ if( pgsz<=0 ){
+ testPrintError("Ridiculous page size: %d", pgsz);
+ return -1;
+ }
+ aData = malloc(pgsz);
+ memset(aData, 0x77, pgsz);
+
+ ctx.fd = open(zFile, O_RDWR|O_CREAT|_O_BINARY, 0644);
+ if( ctx.fd<0 ){
+ perror("open: ");
+ return -1;
+ }
+
+ if( nArg==2 ){
+ readStdin(&zStdin);
+ testTimeInit();
+ z = zStdin;
+ while( *z && rc==0 ){
+ rc = doOneCmd(&ctx, aData, pgsz, z, &z);
+ }
+ }else{
+ testTimeInit();
+ for(i=2; i
+
+/*
+** Called by testSetErrorFunc() each time a test error is recorded. Trips an
+** assert() so that, in builds with assertions enabled, the process stops at
+** the point of failure.
+*/
+void test_failed(){
+  assert( 0 );
+}
+
+/*
+** testSetError(rc) records error code rc for the calling test function. It
+** expects an "int *pRc" variable to be in scope at the point of use.
+**
+** If rc is zero this is a no-op. Otherwise rc is stored in *pRc, a
+** "FAILED" message naming the source location is written to stderr and
+** test_failed() is invoked.
+*/
+#define testSetError(rc) testSetErrorFunc(rc, pRc, __FILE__, __LINE__)
+static void testSetErrorFunc(int rc, int *pRc, const char *zFile, int iLine){
+  if( rc==0 ) return;
+  *pRc = rc;
+  fprintf(stderr, "FAILED (%s:%d) rc=%d ", zFile, iLine, rc);
+  test_failed();
+}
+
+static int lsm_memcmp(u8 *a, u8 *b, int c){
+ int i;
+ for(i=0; i0 && lsm_memcmp(pVal, pDbVal, nVal))) ){
+ testSetError(1);
+ }
+ }
+}
+
+/*
+** If *pRc is zero on entry, write the key/value pair to database pDb using
+** tdb_write() and record any error in *pRc via testSetError(). If *pRc is
+** already non-zero this function is a no-op.
+*/
+void testWrite(
+  TestDb *pDb,                    /* Database handle */
+  void *pKey, int nKey,           /* Key to write */
+  void *pVal, int nVal,           /* Value to write */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  static int nCall = 0;           /* Counts calls that reach tdb_write() */
+
+  if( *pRc!=0 ) return;
+  nCall++;
+  testSetError( tdb_write(pDb, pKey, nKey, pVal, nVal) );
+}
+/*
+** If *pRc is zero on entry, delete key (pKey/nKey) from database pDb using
+** tdb_delete() and record any error in *pRc via testSetError(). If *pRc is
+** already non-zero this function is a no-op.
+*/
+void testDelete(
+  TestDb *pDb,                    /* Database handle */
+  void *pKey, int nKey,           /* Key to delete */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  if( *pRc!=0 ) return;
+
+  /* *pRc is zero here, so it only needs updating when the call fails. */
+  testSetError( tdb_delete(pDb, pKey, nKey) );
+}
+/*
+** If *pRc is zero on entry, invoke tdb_delete_range() on database pDb with
+** the two supplied keys and record any error in *pRc via testSetError().
+** If *pRc is already non-zero this function is a no-op.
+*/
+void testDeleteRange(
+  TestDb *pDb,                    /* Database handle */
+  void *pKey1, int nKey1,         /* First key passed to tdb_delete_range() */
+  void *pKey2, int nKey2,         /* Second key passed to tdb_delete_range() */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  if( *pRc!=0 ) return;
+
+  /* *pRc is zero here, so it only needs updating when the call fails. */
+  testSetError( tdb_delete_range(pDb, pKey1, nKey1, pKey2, nKey2) );
+}
+
+/*
+** If *pRc is zero on entry, call tdb_begin(pDb, iTrans) and record any
+** error in *pRc. Otherwise do nothing.
+*/
+void testBegin(TestDb *pDb, int iTrans, int *pRc){
+  if( *pRc!=0 ) return;
+  testSetError( tdb_begin(pDb, iTrans) );
+}
+/*
+** If *pRc is zero on entry, call tdb_commit(pDb, iTrans) and record any
+** error in *pRc. Otherwise do nothing.
+*/
+void testCommit(TestDb *pDb, int iTrans, int *pRc){
+  if( *pRc!=0 ) return;
+  testSetError( tdb_commit(pDb, iTrans) );
+}
+#if 0 /* unused */
+/*
+** Compiled out. If *pRc is zero on entry, calls tdb_rollback(pDb, iTrans)
+** and records any error in *pRc via testSetError().
+*/
+static void testRollback(TestDb *pDb, int iTrans, int *pRc){
+ if( *pRc==0 ){
+ int rc;
+ rc = tdb_rollback(pDb, iTrans);
+ testSetError(rc);
+ }
+}
+#endif
+
+/*
+** Convenience wrapper around testWrite() for nul-terminated strings. The
+** key length is strlen(zKey). zVal may be NULL, in which case a value
+** length of zero is passed.
+*/
+void testWriteStr(
+  TestDb *pDb,                    /* Database handle */
+  const char *zKey,               /* Key to write */
+  const char *zVal,               /* Value to write (may be NULL) */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int nKey = strlen(zKey);
+  int nVal = 0;
+
+  if( zVal ) nVal = strlen(zVal);
+  testWrite(pDb, (void *)zKey, nKey, (void *)zVal, nVal, pRc);
+}
+
+#if 0 /* unused */
+/*
+** Compiled out. Wrapper around testDelete() that takes the key as a
+** nul-terminated string.
+*/
+static void testDeleteStr(TestDb *pDb, const char *zKey, int *pRc){
+ testDelete(pDb, (void *)zKey, strlen(zKey), pRc);
+}
+#endif
+/*
+** Convenience wrapper around testFetch() for nul-terminated strings. The
+** key length is strlen(zKey). zVal may be NULL, in which case a value
+** length of zero is passed.
+*/
+void testFetchStr(
+  TestDb *pDb,                    /* Database handle */
+  const char *zKey,               /* Key to query database for */
+  const char *zVal,               /* Value handed to testFetch() (may be NULL) */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int nKey = strlen(zKey);
+  int nVal = 0;
+
+  if( zVal ) nVal = strlen(zVal);
+  testFetch(pDb, (void *)zKey, nKey, (void *)zVal, nVal, pRc);
+}
+
+/*
+** Fetch key (pKey/nKey) from both pControl and pDb using tdb_fetch() and
+** record an error in *pRc if either fetch fails or if the two values differ
+** in size or content.
+**
+** Both fetches are attempted even if *pRc is already non-zero on entry; the
+** values are only compared if *pRc is still zero after both fetches.
+*/
+void testFetchCompare(
+  TestDb *pControl,               /* Database holding the expected value */
+  TestDb *pDb,                    /* Database under test */
+  void *pKey, int nKey,           /* Key to fetch from both databases */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  static int nCall = 0;           /* Counts invocations of this function */
+  int rc;
+  void *pCtrlVal;                 /* Value read from pControl */
+  int nCtrlVal;                   /* Size of pCtrlVal in bytes */
+  void *pTestVal;                 /* Value read from pDb */
+  int nTestVal;                   /* Size of pTestVal in bytes */
+
+  nCall++;
+
+  rc = tdb_fetch(pControl, pKey, nKey, &pCtrlVal, &nCtrlVal);
+  testSetError(rc);
+
+  rc = tdb_fetch(pDb, pKey, nKey, &pTestVal, &nTestVal);
+  testSetError(rc);
+
+  if( *pRc!=0 ) return;
+
+  if( nCtrlVal!=nTestVal ){
+    testSetError(1);
+  }else if( nCtrlVal>0 && memcmp(pCtrlVal, pTestVal, nCtrlVal) ){
+    testSetError(1);
+  }
+}
+
+typedef struct ScanResult ScanResult;
+/* Context object handed to scanCompareCb() as its pCtx argument. */
+struct ScanResult {
+ /* Database handle; the visible callback only uses it in #if 0 code */
+ TestDb *pDb;
+
+ /* Incremented once for each row delivered to scanCompareCb() */
+ int nRow;
+ /* Running checksums that scanCompareCb() updates from each key and value */
+ u32 cksum1;
+ u32 cksum2;
+ /* If pKey1 is not NULL, keys that sort before it are reported as too small */
+ void *pKey1; int nKey1;
+ /* Presumably the matching upper bound -- the check is cut off in this view */
+ void *pKey2; int nKey2;
+
+ /* Non-zero if keys are expected to be delivered in descending order */
+ int bReverse;
+ /* Size of the key saved in aPrevKey[]; zero disables the ordering check */
+ int nPrevKey;
+ /* Copy of the previous key (only keys shorter than the array are saved) */
+ u8 aPrevKey[256];
+};
+
+/*
+** Compare two keys. The common prefix, MIN(nKey1, nKey2) bytes long, is
+** compared with memcmp(). If the prefixes are equal the result is
+** (nKey1 - nKey2), so a shorter key sorts before a longer one. The return
+** value is negative, zero or positive as key 1 is less than, equal to or
+** greater than key 2.
+*/
+static int keyCompare(void *pKey1, int nKey1, void *pKey2, int nKey2){
+  int nCommon = MIN(nKey1, nKey2);
+  int res = memcmp(pKey1, pKey2, nCommon);
+  return res ? res : (nKey1 - nKey2);
+}
+
+int test_scan_debug = 0;
+
+static void scanCompareCb(
+ void *pCtx,
+ void *pKey, int nKey,
+ void *pVal, int nVal
+){
+ ScanResult *p = (ScanResult *)pCtx;
+ u8 *aKey = (u8 *)pKey;
+ u8 *aVal = (u8 *)pVal;
+ int i;
+
+ if( test_scan_debug ){
+ printf("%d: %.*s\n", p->nRow, nKey, (char *)pKey);
+ fflush(stdout);
+ }
+#if 0
+ if( test_scan_debug ) printf("%.20s\n", (char *)pVal);
+#endif
+
+#if 0
+ /* Check tdb_fetch() matches */
+ int rc = 0;
+ testFetch(p->pDb, pKey, nKey, pVal, nVal, &rc);
+ assert( rc==0 );
+#endif
+
+ /* Update the checksum data */
+ p->nRow++;
+ for(i=0; icksum1 += ((int)aKey[i] << (i&0x0F));
+ p->cksum2 += p->cksum1;
+ }
+ for(i=0; icksum1 += ((int)aVal[i] << (i&0x0F));
+ p->cksum2 += p->cksum1;
+ }
+
+ /* Check that the delivered row is not out of order. */
+ if( nKey<(int)sizeof(p->aPrevKey) ){
+ if( p->nPrevKey ){
+ int res = keyCompare(p->aPrevKey, p->nPrevKey, pKey, nKey);
+ if( (res<0 && p->bReverse) || (res>0 && p->bReverse==0) ){
+ testPrintError("Returned key out of order at %s:%d\n",
+ __FILE__, __LINE__
+ );
+ }
+ }
+
+ p->nPrevKey = nKey;
+ memcpy(p->aPrevKey, pKey, MIN(p->nPrevKey, nKey));
+ }
+
+ /* Check that the delivered row is within range. */
+ if( p->pKey1 && (
+ (memcmp(p->pKey1, pKey, MIN(p->nKey1, nKey))>0)
+ || (memcmp(p->pKey1, pKey, MIN(p->nKey1, nKey))==0 && p->nKey1>nKey)
+ )){
+ testPrintError("Returned key too small at %s:%d\n", __FILE__, __LINE__);
+ }
+ if( p->pKey2 && (
+ (memcmp(p->pKey2, pKey, MIN(p->nKey2, nKey))<0)
+ || (memcmp(p->pKey2, pKey, MIN(p->nKey2, nKey))==0 && p->nKey2=0 );
+ zRet = (char *)testMalloc(nByte+1);
+ vsnprintf(zRet, nByte+1, zFormat, ap);
+ return zRet;
+}
+
+/*
+** printf()-style front end to testMallocVPrintf(): collects the variable
+** arguments into a va_list, passes them on and returns whatever
+** testMallocVPrintf() returns.
+*/
+char *testMallocPrintf(const char *zFormat, ...){
+  char *zFormatted;
+  va_list args;
+
+  va_start(args, zFormat);
+  zFormatted = testMallocVPrintf(zFormat, args);
+  va_end(args);
+
+  return zFormatted;
+}
+
+
+/*
+** A wrapper around malloc(3).
+**
+** This function should be used for all allocations made by test procedures.
+** It has the following properties:
+**
+**   * Test code may assume that allocations may not fail. If the underlying
+**     malloc() returns NULL, a message is written to stderr and the process
+**     is terminated with abort() instead of returning.
+**   * Returned memory is always zeroed.
+**
+** Each allocation is preceded by an 8 byte header, the first sizeof(int)
+** bytes of which record the requested size n. The pointer returned to the
+** caller is the address immediately following the header.
+**
+** Allocations made using testMalloc() should be freed using testFree().
+*/
+void *testMalloc(int n){
+  /* Do the size arithmetic in size_t so that n+8 cannot overflow an int */
+  size_t nTotal = (size_t)n + 8;
+  u8 *p = (u8*)malloc(nTotal);
+
+  if( p==0 ){
+    fprintf(stderr, "testMalloc(): failed to allocate %d bytes\n", n);
+    abort();
+  }
+  memset(p, 0, nTotal);
+  *(int*)p = n;
+  return (void*)&p[8];
+}
+
+/*
+** Return a new testMalloc() allocation of nByte bytes holding a copy of
+** the nByte bytes at pCopy.
+*/
+void *testMallocCopy(void *pCopy, int nByte){
+  void *pDup = testMalloc(nByte);
+  return memcpy(pDup, pCopy, nByte);
+}
+
+void *testRealloc(void *ptr, int n){
+ if( ptr ){
+ u8 *p = (u8*)ptr - 8;
+ int nOrig = *(int*)p;
+ p = (u8*)realloc(p, n+8);
+ if( nOrig1 ){
+ testPrintError("Usage: test ?PATTERN?\n");
+ return 1;
+ }
+ if( nArg==1 ){
+ zPattern = azArg[0];
+ }
+
+ for(j=0; tdb_system_name(j); j++){
+ rc = 0;
+
+ test_data_1(tdb_system_name(j), zPattern, &rc);
+ test_data_2(tdb_system_name(j), zPattern, &rc);
+ test_data_3(tdb_system_name(j), zPattern, &rc);
+ test_data_4(tdb_system_name(j), zPattern, &rc);
+ test_rollback(tdb_system_name(j), zPattern, &rc);
+ test_mc(tdb_system_name(j), zPattern, &rc);
+ test_mt(tdb_system_name(j), zPattern, &rc);
+
+ if( rc ) nFail++;
+ }
+
+ rc = 0;
+ test_oom(zPattern, &rc);
+ if( rc ) nFail++;
+
+ rc = 0;
+ test_api(zPattern, &rc);
+ if( rc ) nFail++;
+
+ rc = 0;
+ do_crash_test(zPattern, &rc);
+ if( rc ) nFail++;
+
+ rc = 0;
+ do_writer_crash_test(zPattern, &rc);
+ if( rc ) nFail++;
+
+ return (nFail!=0);
+}
+
+/*
+** Return the value of tdb_lsm(pDb). If that value is not NULL, first apply
+** a fixed configuration string to pDb using tdb_lsm_config_str().
+*/
+static lsm_db *configure_lsm_db(TestDb *pDb){
+  lsm_db *pLsm = tdb_lsm(pDb);
+
+  if( pLsm!=0 ){
+    tdb_lsm_config_str(pDb, "mmap=1 autowork=1 automerge=4 worker_automerge=4");
+  }
+  return pLsm;
+}
+
+typedef struct WriteHookEvent WriteHookEvent;
+/*
+** A buffered write event. The single global instance "prev" holds the
+** write that flushPrev() will print next.
+*/
+struct WriteHookEvent {
+ /* File offset of the buffered write */
+ i64 iOff;
+ /* Size of the buffered write; zero means nothing is buffered */
+ int nData;
+ /* Last field printed by flushPrev(); presumably microseconds -- TODO confirm */
+ int nUs;
+};
+WriteHookEvent prev = {0, 0, 0};
+
+/*
+** If the global "prev" holds a buffered write (prev.nData is non-zero),
+** print it to pOut as a "w" record and mark the buffer empty. Otherwise do
+** nothing.
+*/
+static void flushPrev(FILE *pOut){
+  if( prev.nData==0 ) return;
+  fprintf(pOut, "w %s %lld %d %d\n", "d", prev.iOff, prev.nData, prev.nUs);
+  prev.nData = 0;
+}
+
+#if 0 /* unused */
+/*
+** Compiled out. Write-hook callback that logs database-file writes and
+** syncs to the FILE* passed as pCtx. Events with bLog set are ignored.
+**
+** A write that begins exactly where the buffered write in "prev" ends is
+** merged into it. Any other event first flushes "prev"; a zero-length event
+** (nData==0) is then printed as an "s" record, while a write becomes the new
+** buffered event.
+*/
+static void do_speed_write_hook2(
+ void *pCtx,
+ int bLog,
+ i64 iOff,
+ int nData,
+ int nUs
+){
+ FILE *pOut = (FILE *)pCtx;
+ if( bLog ) return;
+
+ if( prev.nData && nData && iOff==prev.iOff+prev.nData ){
+ prev.nData += nData;
+ prev.nUs += nUs;
+ }else{
+ flushPrev(pOut);
+ if( nData==0 ){
+ /* bLog is always zero here because of the early return above */
+ fprintf(pOut, "s %s 0 0 %d\n", (bLog ? "l" : "d"), nUs);
+ }else{
+ prev.iOff = iOff;
+ prev.nData = nData;
+ prev.nUs = nUs;
+ }
+ }
+}
+#endif
+
+#define ST_REPEAT 0
+#define ST_WRITE 1
+#define ST_PAUSE 2
+#define ST_FETCH 3
+#define ST_SCAN 4
+#define ST_NSCAN 5
+#define ST_KEYSIZE 6
+#define ST_VALSIZE 7
+#define ST_TRANS 8
+
+
+/*
+** Print the usage message for the "speed2" sub-command to stdout. The
+** option list mirrors the aOpt[] table in do_speed_test2(), including the
+** -scan and -nscan options with their default values.
+*/
+static void print_speed_test_help(void){
+  printf(
+"\n"
+"Repeat the following $repeat times:\n"
+" 1. Insert $write key-value pairs. One transaction for each write op.\n"
+" 2. Pause for $pause ms.\n"
+" 3. Perform $fetch queries on the database.\n"
+"\n"
+" Keys are $keysize bytes in size. Values are $valsize bytes in size\n"
+" Both keys and values are pseudo-randomly generated\n"
+"\n"
+"Options are:\n"
+" -repeat $repeat (default value 10)\n"
+" -write $write (default value 10000)\n"
+" -pause $pause (default value 0)\n"
+" -fetch $fetch (default value 0)\n"
+" -scan $scan (default value 0)\n"
+" -nscan $nscan (default value 0)\n"
+" -keysize $keysize (default value 12)\n"
+" -valsize $valsize (default value 100)\n"
+" -system $system (default value \"lsm\")\n"
+" -trans $trans (default value 0)\n"
+"\n"
+  );
+}
+
+int do_speed_test2(int nArg, char **azArg){
+ struct Option {
+ const char *zOpt;
+ int eVal;
+ int iDefault;
+ } aOpt[] = {
+ { "-repeat", ST_REPEAT, 10},
+ { "-write", ST_WRITE, 10000},
+ { "-pause", ST_PAUSE, 0},
+ { "-fetch", ST_FETCH, 0},
+ { "-scan", ST_SCAN, 0},
+ { "-nscan", ST_NSCAN, 0},
+ { "-keysize", ST_KEYSIZE, 12},
+ { "-valsize", ST_VALSIZE, 100},
+ { "-trans", ST_TRANS, 0},
+ { "-system", -1, 0},
+ { "help", -2, 0},
+ {0, 0, 0}
+ };
+ int i;
+ int aParam[9];
+ int rc = 0;
+ int bReadonly = 0;
+ int nContent = 0;
+
+ TestDb *pDb;
+ Datasource *pData;
+ DatasourceDefn defn = { TEST_DATASOURCE_RANDOM, 0, 0, 0, 0 };
+ char *zSystem = "";
+ int bLsm = 1;
+ FILE *pLog = 0;
+
+#ifdef NDEBUG
+ /* If NDEBUG is defined, disable the dynamic memory related checks in
+ ** lsmtest_mem.c. They slow things down. */
+ testMallocUninstall(tdb_lsm_env());
+#endif
+
+ /* Initialize aParam[] with default values. */
+ for(i=0; i=0 ){
+ aParam[aOpt[iSel].eVal] = atoi(azArg[i+1]);
+ }else{
+ zSystem = azArg[i+1];
+ bLsm = 0;
+#if 0
+ for(j=0; zSystem[j]; j++){
+ if( zSystem[j]=='=' ) bLsm = 1;
+ }
+#endif
+ }
+ }
+
+ printf("#");
+ for(i=0; i=0 ){
+ printf(" %s=%d", &aOpt[i].zOpt[1], aParam[aOpt[i].eVal]);
+ }else if( aOpt[i].eVal==-1 ){
+ printf(" %s=\"%s\"", &aOpt[i].zOpt[1], zSystem);
+ }
+ }
+ }
+ printf("\n");
+
+ defn.nMinKey = defn.nMaxKey = aParam[ST_KEYSIZE];
+ defn.nMinVal = defn.nMaxVal = aParam[ST_VALSIZE];
+ pData = testDatasourceNew(&defn);
+
+ if( aParam[ST_WRITE]==0 ){
+ bReadonly = 1;
+ }
+
+ if( bLsm ){
+ rc = tdb_lsm_open(zSystem, "testdb.lsm", !bReadonly, &pDb);
+ }else{
+ pDb = testOpen(zSystem, !bReadonly, &rc);
+ }
+ if( rc!=0 ) return rc;
+ if( bReadonly ){
+ nContent = testCountDatabase(pDb);
+ }
+
+#if 0
+ pLog = fopen("/tmp/speed.log", "w");
+ tdb_lsm_write_hook(pDb, do_speed_write_hook2, (void *)pLog);
+#endif
+
+ for(i=0; i=nArg ){
+ testPrintError("option %s requires an argument\n", aOpt[iSel].zOpt);
+ return 1;
+ }
+ if( aOpt[iSel].isSwitch==1 ){
+ nRow = atoi(azArg[i]);
+ }
+ if( aOpt[iSel].isSwitch==2 ){
+ nSleep = atoi(azArg[i]);
+ }
+ if( aOpt[iSel].isSwitch==3 ){
+ struct Mode {
+ const char *zMode;
+ int doReadTest;
+ int doWriteTest;
+ } aMode[] = {{"ro", 1, 0} , {"rw", 1, 1}, {"wo", 0, 1}, {0, 0, 0}};
+ int iMode;
+ rc = testArgSelect(aMode, "option", azArg[i], &iMode);
+ if( rc ) return rc;
+ doReadTest = aMode[iMode].doReadTest;
+ doWriteTest = aMode[iMode].doWriteTest;
+ }
+ if( aOpt[iSel].isSwitch==4 ){
+ /* The "-out FILE" switch. This option is used to specify a file to
+ ** write the gnuplot script to. */
+ zOut = azArg[i];
+ }
+ }else{
+ /* A db name */
+ rc = testArgSelect(aOpt, "system", azArg[i], &iSel);
+ if( rc ) return rc;
+ sys_mask |= (1< 100000) ? 100000 : nSelStep;
+
+ aTime = malloc(sizeof(int) * ArraySize(aSys) * nRow/nStep);
+ aWrite = malloc(sizeof(int) * nRow/nStep);
+ aSelTime = malloc(sizeof(int) * ArraySize(aSys) * nRow/nSelStep);
+
+ /* This loop collects the INSERT speed data. */
+ if( doWriteTest ){
+ printf("Writing output to file \"%s\".\n", zOut);
+
+ for(j=0; aSys[j].zLibrary; j++){
+ FILE *pLog = 0;
+ TestDb *pDb; /* Database being tested */
+ lsm_db *pLsm;
+ int iDot = 0;
+
+ if( ((1<nData ){
+ fprintf(pHook->pOut, "write %s %d %d\n",
+ (pHook->bLog ? "log" : "db"), (int)pHook->iOff, pHook->nData
+ );
+ pHook->nData = 0;
+ fflush(pHook->pOut);
+ }
+}
+
+/*
+** Write-hook callback used by the "insert" sub-command. pCtx points to an
+** InsertWriteHook. Events with bLog set are ignored.
+**
+**   * A zero-length event (nData==0) flushes any buffered write and then
+**     prints a "sync" record.
+**   * A write that starts exactly where the buffered write ends, and has
+**     the same bLog value, is merged into the buffered write.
+**   * Any other write flushes the buffered write and takes its place.
+**
+** Parameter nUs is not used.
+*/
+static void do_insert_write_hook(
+  void *pCtx,
+  int bLog,
+  i64 iOff,
+  int nData,
+  int nUs
+){
+  InsertWriteHook *pHook = (InsertWriteHook *)pCtx;
+  int bContiguous;                /* True if this write extends the buffer */
+
+  if( bLog ) return;
+
+  if( nData==0 ){
+    flushHook(pHook);
+    fprintf(pHook->pOut, "sync %s\n", (bLog ? "log" : "db"));
+    return;
+  }
+
+  bContiguous = (
+      pHook->nData
+   && bLog==pHook->bLog
+   && iOff==(pHook->iOff+pHook->nData)
+  );
+  if( bContiguous ){
+    pHook->nData += nData;
+    return;
+  }
+
+  flushHook(pHook);
+  pHook->bLog = bLog;
+  pHook->iOff = iOff;
+  pHook->nData = nData;
+}
+
+static int do_replay(int nArg, char **azArg){
+ char aBuf[4096];
+ FILE *pInput;
+ FILE *pClose = 0;
+ const char *zDb;
+
+ lsm_env *pEnv;
+ lsm_file *pOut;
+ int rc;
+
+ if( nArg!=2 ){
+ testPrintError("Usage: replay WRITELOG FILE\n");
+ return 1;
+ }
+
+ if( strcmp(azArg[0], "-")==0 ){
+ pInput = stdin;
+ }else{
+ pClose = pInput = fopen(azArg[0], "r");
+ }
+ zDb = azArg[1];
+ pEnv = tdb_lsm_env();
+ rc = pEnv->xOpen(pEnv, zDb, 0, &pOut);
+ if( rc!=LSM_OK ) return rc;
+
+ while( feof(pInput)==0 ){
+ char zLine[80];
+ fgets(zLine, sizeof(zLine)-1, pInput);
+ zLine[sizeof(zLine)-1] = '\0';
+
+ if( 0==memcmp("sync db", zLine, 7) ){
+ rc = pEnv->xSync(pOut);
+ if( rc!=0 ) break;
+ }else{
+ int iOff;
+ int nData;
+ int nMatch;
+ nMatch = sscanf(zLine, "write db %d %d", &iOff, &nData);
+ if( nMatch==2 ){
+ int i;
+ for(i=0; ixWrite(pOut, iOff+i, aBuf, sizeof(aBuf));
+ if( rc!=0 ) break;
+ }
+ }
+ }
+ }
+ if( pClose ) fclose(pClose);
+ pEnv->xClose(pOut);
+
+ return rc;
+}
+
+static int do_insert(int nArg, char **azArg){
+ const char *zDb = "lsm";
+ TestDb *pDb = 0;
+ int i;
+ int rc;
+ const int nRow = 1 * 1000 * 1000;
+
+ DatasourceDefn defn = { TEST_DATASOURCE_RANDOM, 8, 15, 80, 150 };
+ Datasource *pData = 0;
+
+ if( nArg>1 ){
+ testPrintError("Usage: insert ?DATABASE?\n");
+ return 1;
+ }
+ if( nArg==1 ){ zDb = azArg[0]; }
+
+ testMallocUninstall(tdb_lsm_env());
+ for(i=0; zDb[i] && zDb[i]!='='; i++);
+ if( zDb[i] ){
+ rc = tdb_lsm_open(zDb, "testdb.lsm", 1, &pDb);
+ }else{
+ rc = tdb_open(zDb, 0, 1, &pDb);
+ }
+
+ if( rc!=0 ){
+ testPrintError("Error opening db \"%s\": %d\n", zDb, rc);
+ }else{
+ InsertWriteHook hook;
+ memset(&hook, 0, sizeof(hook));
+ hook.pOut = fopen("writelog.txt", "w");
+
+ pData = testDatasourceNew(&defn);
+ tdb_lsm_config_work_hook(pDb, do_insert_work_hook, 0);
+ tdb_lsm_write_hook(pDb, do_insert_write_hook, (void *)&hook);
+
+ if( rc==0 ){
+ for(i=0; i
+#include
+
+/*
+** Print a one-line summary of this process's getrusage() counters
+** (ru_maxrss, ru_oublock and ru_inblock) to stdout.
+*/
+static void lsmtest_rusage_report(void){
+  struct rusage usage;
+  int nMaxRss;
+  int nOutBlock;
+  int nInBlock;
+
+  memset(&usage, 0, sizeof(usage));
+  getrusage(RUSAGE_SELF, &usage);
+
+  nMaxRss = (int)usage.ru_maxrss;
+  nOutBlock = (int)usage.ru_oublock;
+  nInBlock = (int)usage.ru_inblock;
+  printf("# getrusage: { ru_maxrss %d ru_oublock %d ru_inblock %d }\n",
+      nMaxRss, nOutBlock, nInBlock
+  );
+}
+#else
+/*
+** Stub used when the getrusage() based implementation above is not
+** compiled in. Prints nothing.
+*/
+static void lsmtest_rusage_report(void){
+  /* no-op */
+}
+#endif
+
+/*
+** Entry point of the lsmtest program.
+**
+** argv[1] selects a sub-command from the aTest[] table and the remaining
+** arguments are passed to its implementation. The memory-checking
+** allocator is installed for the duration of the run. Afterwards any
+** leaked allocations are reported, and for sub-commands with bRusageReport
+** set a getrusage() summary is printed.
+**
+** Returns the sub-command's return code, or a non-zero value if the
+** sub-command could not be selected or if allocations were leaked.
+*/
+int main(int argc, char **argv){
+  struct TestFunc {
+    const char *zName;
+    int bRusageReport;
+    int (*xFunc)(int, char **);
+  } aTest[] = {
+    {"random", 1, do_random_tests},
+    {"writespeed", 1, do_writer_test},
+    {"io", 1, st_do_io},
+
+    {"insert", 1, do_insert},
+    {"replay", 1, do_replay},
+
+    {"speed", 1, do_speed_tests},
+    {"speed2", 1, do_speed_test2},
+    {"show", 0, st_do_show},
+    {"work", 1, st_do_work},
+    {"test", 1, do_test},
+
+    {0, 0, 0}
+  };
+  int rc;                         /* Return Code */
+  int iFunc = 0;                  /* Index into aTest[] */
+  int bSelected = 0;              /* True once iFunc is known to be valid */
+
+  int nLeakAlloc = 0;             /* Allocations leaked by lsm */
+  int nLeakByte = 0;              /* Bytes leaked by lsm */
+
+#ifdef LSM_DEBUG_MEM
+  FILE *pReport = 0;              /* lsm malloc() report file */
+  const char *zReport = "malloc.txt generated";
+#else
+  const char *zReport = "malloc.txt NOT generated";
+#endif
+
+  testMallocInstall(tdb_lsm_env());
+
+  if( argc<2 ){
+    testPrintError("Usage: %s sub-command ?args...?\n", argv[0]);
+    /* Do not leave the allocator wrapper installed on the early exit */
+    testMallocUninstall(tdb_lsm_env());
+    return -1;
+  }
+
+  /* Initialize error reporting */
+  testErrorInit(argc, argv);
+
+  /* Initialize PRNG system */
+  testPrngInit();
+
+  rc = testArgSelect(aTest, "sub-command", argv[1], &iFunc);
+  if( rc==0 ){
+    bSelected = 1;
+    rc = aTest[iFunc].xFunc(argc-2, &argv[2]);
+  }
+
+#ifdef LSM_DEBUG_MEM
+  pReport = fopen("malloc.txt", "w");
+  if( pReport==0 ){
+    /* The report file could not be created; run the check without one */
+    zReport = "malloc.txt NOT generated";
+  }
+  testMallocCheck(tdb_lsm_env(), &nLeakAlloc, &nLeakByte, pReport);
+  if( pReport ) fclose(pReport);
+#else
+  testMallocCheck(tdb_lsm_env(), &nLeakAlloc, &nLeakByte, 0);
+#endif
+
+  if( nLeakAlloc ){
+    testPrintError("Leaked %d bytes in %d allocations (%s)\n",
+        nLeakByte, nLeakAlloc, zReport
+    );
+    if( rc==0 ) rc = -1;
+  }
+  testMallocUninstall(tdb_lsm_env());
+
+  /* Only index aTest[] if testArgSelect() succeeded in setting iFunc */
+  if( bSelected && aTest[iFunc].bRusageReport ){
+    lsmtest_rusage_report();
+  }
+  return rc;
+}
diff --git a/ext/lsm1/lsm-test/lsmtest_mem.c b/ext/lsm1/lsm-test/lsmtest_mem.c
new file mode 100644
index 0000000..4c35e84
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_mem.c
@@ -0,0 +1,409 @@
+
+#include
+#include
+#include
+
+#define ArraySize(x) ((int)(sizeof(x) / sizeof((x)[0])))
+
+#define MIN(x,y) ((x)<(y) ? (x) : (y))
+
+typedef unsigned int u32;
+typedef unsigned char u8;
+typedef long long int i64;
+typedef unsigned long long int u64;
+
+#if defined(__GLIBC__) && defined(LSM_DEBUG_MEM)
+ extern int backtrace(void**,int);
+ extern void backtrace_symbols_fd(void*const*,int,int);
+# define TM_BACKTRACE 12
+#else
+# define backtrace(A,B) 1
+# define backtrace_symbols_fd(A,B,C)
+#endif
+
+
+typedef struct TmBlockHdr TmBlockHdr;
+typedef struct TmAgg TmAgg;
+typedef struct TmGlobal TmGlobal;
+
+/*
+** State of one installed memory-checking allocator. testMallocInstall()
+** stores a pointer to an instance of this structure in lsm_env.pMemCtx.
+*/
+struct TmGlobal {
+ /* Linked list of all currently outstanding allocations. And a table of
+ ** all allocations, past and present, indexed by backtrace() info. */
+ TmBlockHdr *pFirst;
+#ifdef TM_BACKTRACE
+ TmAgg *aHash[10000];
+#endif
+
+ /* Underlying malloc/realloc/free functions */
+ void *(*xMalloc)(int); /* underlying malloc(3) function */
+ void *(*xRealloc)(void *, int); /* underlying realloc(3) function */
+ void (*xFree)(void *); /* underlying free(3) function */
+
+ /* Mutex to protect pFirst and aHash */
+ void (*xEnterMutex)(TmGlobal*); /* Call this to enter the mutex */
+ void (*xLeaveMutex)(TmGlobal*); /* Call this to leave mutex */
+ void (*xDelMutex)(TmGlobal*); /* Call this to delete mutex */
+ void *pMutex; /* Mutex handle */
+
+ /* Allocator callbacks copied from the lsm_env by testMallocInstall() and
+ ** put back by testMallocUninstall(). */
+ void *(*xSaveMalloc)(void *, size_t);
+ void *(*xSaveRealloc)(void *, void *, size_t);
+ void (*xSaveFree)(void *, void *);
+
+ /* OOM injection scheduling. If nCountdown is greater than zero when a
+ ** malloc attempt is made, it is decremented. If this means nCountdown
+ ** transitions from 1 to 0, then the allocation fails. If bPersist is true
+ ** when this happens, nCountdown is then incremented back to 1 (so that the
+ ** next attempt fails too).
+ **
+ ** The countdown is only consulted while bEnable is non-zero. When an
+ ** allocation is made to fail, xHook (if not NULL) is invoked with pHookCtx.
+ */
+ int nCountdown;
+ int bPersist;
+ int bEnable;
+ void (*xHook)(void *);
+ void *pHookCtx;
+};
+
+/*
+** Header stored immediately before each allocation handed out by
+** tmMalloc(). The user data begins BLOCK_HDR_SIZE bytes after the start of
+** the header and is followed by a 4 byte copy of the rearguard value.
+*/
+struct TmBlockHdr {
+ /* Links in the TmGlobal.pFirst list of outstanding allocations */
+ TmBlockHdr *pNext;
+ TmBlockHdr *pPrev;
+ /* Number of bytes requested by the caller of tmMalloc() */
+ int nByte;
+#ifdef TM_BACKTRACE
+ /* Per-backtrace statistics entry this allocation is counted in */
+ TmAgg *pAgg;
+#endif
+ /* Set to FOREGUARD by tmMalloc(); verified by an assert() in tmFree() */
+ u32 iForeGuard;
+};
+
+#ifdef TM_BACKTRACE
+/*
+** Aggregate statistics for all allocations made with the same backtrace.
+** Entries are chained from the TmGlobal.aHash[] hash table.
+*/
+struct TmAgg {
+ int nAlloc; /* Number of allocations at this path */
+ int nByte; /* Total number of bytes allocated */
+ int nOutAlloc; /* Number of outstanding allocations */
+ int nOutByte; /* Number of outstanding bytes */
+ void *aFrame[TM_BACKTRACE]; /* backtrace() output */
+ TmAgg *pNext; /* Next object in hash-table collision */
+};
+#endif
+
+#define FOREGUARD 0x80F5E153
+#define REARGUARD 0xE4676B53
+static const u32 rearguard = REARGUARD;
+
+#define ROUND8(x) (((x)+7)&~7)
+
+#define BLOCK_HDR_SIZE (ROUND8( sizeof(TmBlockHdr) ))
+
+/*
+** Called by tmMalloc() each time it simulates an OOM error. It does
+** nothing except count the calls in a function-local static.
+*/
+static void lsmtest_oom_error(void){
+  static int nErr = 0;
+  nErr += 1;
+}
+
+/*
+** Enter the mutex of pTm by invoking its xEnterMutex callback.
+*/
+static void tmEnterMutex(TmGlobal *pTm){
+  void (*xEnter)(TmGlobal*) = pTm->xEnterMutex;
+  xEnter(pTm);
+}
+
+/*
+** Leave the mutex of pTm by invoking its xLeaveMutex callback.
+*/
+static void tmLeaveMutex(TmGlobal *pTm){
+  void (*xLeave)(TmGlobal*) = pTm->xLeaveMutex;
+  xLeave(pTm);
+}
+
+static void *tmMalloc(TmGlobal *pTm, int nByte){
+ TmBlockHdr *pNew; /* New allocation header block */
+ u8 *pUser; /* Return value */
+ int nReq; /* Total number of bytes requested */
+
+ assert( sizeof(rearguard)==4 );
+ nReq = BLOCK_HDR_SIZE + nByte + 4;
+ pNew = (TmBlockHdr *)pTm->xMalloc(nReq);
+ memset(pNew, 0, sizeof(TmBlockHdr));
+
+ tmEnterMutex(pTm);
+ assert( pTm->nCountdown>=0 );
+ assert( pTm->bPersist==0 || pTm->bPersist==1 );
+
+ if( pTm->bEnable && pTm->nCountdown==1 ){
+ /* Simulate an OOM error. */
+ lsmtest_oom_error();
+ pTm->xFree(pNew);
+ pTm->nCountdown = pTm->bPersist;
+ if( pTm->xHook ) pTm->xHook(pTm->pHookCtx);
+ pUser = 0;
+ }else{
+ if( pTm->bEnable && pTm->nCountdown ) pTm->nCountdown--;
+
+ pNew->iForeGuard = FOREGUARD;
+ pNew->nByte = nByte;
+ pNew->pNext = pTm->pFirst;
+
+ if( pTm->pFirst ){
+ pTm->pFirst->pPrev = pNew;
+ }
+ pTm->pFirst = pNew;
+
+ pUser = &((u8 *)pNew)[BLOCK_HDR_SIZE];
+ memset(pUser, 0x56, nByte);
+ memcpy(&pUser[nByte], &rearguard, 4);
+
+#ifdef TM_BACKTRACE
+ {
+ TmAgg *pAgg;
+ int i;
+ u32 iHash = 0;
+ void *aFrame[TM_BACKTRACE];
+ memset(aFrame, 0, sizeof(aFrame));
+ backtrace(aFrame, TM_BACKTRACE);
+
+ for(i=0; iaHash);
+
+ for(pAgg=pTm->aHash[iHash]; pAgg; pAgg=pAgg->pNext){
+ if( memcmp(pAgg->aFrame, aFrame, sizeof(aFrame))==0 ) break;
+ }
+ if( !pAgg ){
+ pAgg = (TmAgg *)pTm->xMalloc(sizeof(TmAgg));
+ memset(pAgg, 0, sizeof(TmAgg));
+ memcpy(pAgg->aFrame, aFrame, sizeof(aFrame));
+ pAgg->pNext = pTm->aHash[iHash];
+ pTm->aHash[iHash] = pAgg;
+ }
+ pAgg->nAlloc++;
+ pAgg->nByte += nByte;
+ pAgg->nOutAlloc++;
+ pAgg->nOutByte += nByte;
+ pNew->pAgg = pAgg;
+ }
+#endif
+ }
+
+ tmLeaveMutex(pTm);
+ return pUser;
+}
+
+/*
+** Free an allocation obtained from tmMalloc() or tmRealloc(). Passing a
+** NULL pointer is a no-op.
+**
+** Both guard values are verified with assert(), the block is unlinked from
+** the list of outstanding allocations, and the user data and header are
+** overwritten before the memory goes back to the underlying allocator.
+*/
+static void tmFree(TmGlobal *pTm, void *p){
+ if( p ){
+ TmBlockHdr *pHdr;
+ u8 *pUser = (u8 *)p;
+
+ tmEnterMutex(pTm);
+ /* The header sits BLOCK_HDR_SIZE bytes before the user pointer */
+ pHdr = (TmBlockHdr *)(pUser - BLOCK_HDR_SIZE);
+ assert( pHdr->iForeGuard==FOREGUARD );
+ assert( 0==memcmp(&pUser[pHdr->nByte], &rearguard, 4) );
+
+ /* Unlink the block from the doubly-linked list rooted at pTm->pFirst */
+ if( pHdr->pPrev ){
+ assert( pHdr->pPrev->pNext==pHdr );
+ pHdr->pPrev->pNext = pHdr->pNext;
+ }else{
+ assert( pHdr==pTm->pFirst );
+ pTm->pFirst = pHdr->pNext;
+ }
+ if( pHdr->pNext ){
+ assert( pHdr->pNext->pPrev==pHdr );
+ pHdr->pNext->pPrev = pHdr->pPrev;
+ }
+
+#ifdef TM_BACKTRACE
+ pHdr->pAgg->nOutAlloc--;
+ pHdr->pAgg->nOutByte -= pHdr->nByte;
+#endif
+
+ tmLeaveMutex(pTm);
+ /* Overwrite the user data first: pHdr->nByte is needed for its size */
+ memset(pUser, 0x58, pHdr->nByte);
+ memset(pHdr, 0x57, sizeof(TmBlockHdr));
+ pTm->xFree(pHdr);
+ }
+}
+
+/*
+** Resize allocation p to nByte bytes. A new block is always obtained with
+** tmMalloc(). If that succeeds and p is not NULL, the smaller of nByte and
+** the old size is copied across and p is released with tmFree().
+**
+** If tmMalloc() returns NULL, NULL is returned and p is left untouched.
+*/
+static void *tmRealloc(TmGlobal *pTm, void *p, int nByte){
+  TmBlockHdr *pOldHdr;            /* Header of the original allocation */
+  int nCopy;                      /* Number of bytes to carry over */
+  void *pNew = tmMalloc(pTm, nByte);
+
+  if( pNew==0 || p==0 ) return pNew;
+
+  pOldHdr = (TmBlockHdr *)((u8 *)p - BLOCK_HDR_SIZE);
+  nCopy = MIN(nByte, pOldHdr->nByte);
+  memcpy(pNew, p, nCopy);
+  tmFree(pTm, p);
+  return pNew;
+}
+
+/*
+** Configure OOM injection for pTm and enable it. nCountdown must be
+** non-negative and bPersist must be 0 or 1 (both are checked with
+** assert()). xHook and pHookCtx are stored for tmMalloc() to invoke when
+** it simulates a failure.
+*/
+static void tmMallocOom(
+  TmGlobal *pTm,
+  int nCountdown,
+  int bPersist,
+  void (*xHook)(void *),
+  void *pHookCtx
+){
+  assert( nCountdown>=0 );
+  assert( bPersist==0 || bPersist==1 );
+
+  pTm->xHook = xHook;
+  pTm->pHookCtx = pHookCtx;
+  pTm->bPersist = bPersist;
+  pTm->nCountdown = nCountdown;
+  pTm->bEnable = 1;
+}
+
+/*
+** Set the bEnable flag of pTm, which tmMalloc() consults before applying
+** the OOM countdown.
+*/
+static void tmMallocOomEnable(TmGlobal *pTm, int bEnable){
+  pTm->bEnable = bEnable;
+}
+
+static void tmMallocCheck(
+ TmGlobal *pTm,
+ int *pnLeakAlloc,
+ int *pnLeakByte,
+ FILE *pFile
+){
+ TmBlockHdr *pHdr;
+ int nLeak = 0;
+ int nByte = 0;
+
+ if( pTm==0 ) return;
+
+ for(pHdr=pTm->pFirst; pHdr; pHdr=pHdr->pNext){
+ nLeak++;
+ nByte += pHdr->nByte;
+ }
+ if( pnLeakAlloc ) *pnLeakAlloc = nLeak;
+ if( pnLeakByte ) *pnLeakByte = nByte;
+
+#ifdef TM_BACKTRACE
+ if( pFile ){
+ int i;
+ fprintf(pFile, "LEAKS\n");
+ for(i=0; iaHash); i++){
+ TmAgg *pAgg;
+ for(pAgg=pTm->aHash[i]; pAgg; pAgg=pAgg->pNext){
+ if( pAgg->nOutAlloc ){
+ int j;
+ fprintf(pFile, "%d %d ", pAgg->nOutByte, pAgg->nOutAlloc);
+ for(j=0; jaFrame[j]);
+ }
+ fprintf(pFile, "\n");
+ }
+ }
+ }
+ fprintf(pFile, "\nALLOCATIONS\n");
+ for(i=0; iaHash); i++){
+ TmAgg *pAgg;
+ for(pAgg=pTm->aHash[i]; pAgg; pAgg=pAgg->pNext){
+ int j;
+ fprintf(pFile, "%d %d ", pAgg->nByte, pAgg->nAlloc);
+ for(j=0; jaFrame[j]);
+ fprintf(pFile, "\n");
+ }
+ }
+ }
+#else
+ (void)pFile;
+#endif
+}
+
+
+#include "lsm.h"
+#include "stdlib.h"
+
+typedef struct LsmMutex LsmMutex;
+/*
+** Mutex handle stored in TmGlobal.pMutex by testMallocInstall(): an lsm
+** mutex together with the environment whose callbacks operate on it.
+*/
+struct LsmMutex {
+ /* Environment supplying the xMutexEnter/xMutexLeave callbacks */
+ lsm_env *pEnv;
+ /* Mutex obtained from pEnv->xMutexStatic() with LSM_MUTEX_HEAP */
+ lsm_mutex *pMutex;
+};
+
+/*
+** TmGlobal.xEnterMutex implementation: enter the lsm mutex held in the
+** LsmMutex object that pTm->pMutex points to.
+*/
+static void tmLsmMutexEnter(TmGlobal *pTm){
+  LsmMutex *pLock = (LsmMutex *)pTm->pMutex;
+  lsm_env *pEnv = pLock->pEnv;
+  pEnv->xMutexEnter(pLock->pMutex);
+}
+
+/*
+** TmGlobal.xLeaveMutex implementation: leave the lsm mutex held in the
+** LsmMutex object that pTm->pMutex points to.
+*/
+static void tmLsmMutexLeave(TmGlobal *pTm){
+  LsmMutex *pLock = (LsmMutex *)(pTm->pMutex);
+  lsm_env *pEnv = pLock->pEnv;
+  pEnv->xMutexLeave(pLock->pMutex);
+}
+
+/*
+** TmGlobal.xDelMutex implementation: release the LsmMutex wrapper object
+** using pTm->xFree. The wrapped lsm mutex itself is not touched.
+*/
+static void tmLsmMutexDel(TmGlobal *pTm){
+  pTm->xFree(pTm->pMutex);
+}
+/* TmGlobal.xMalloc implementation: thin wrapper around malloc(3). */
+static void *tmLsmMalloc(int n){
+  return malloc(n);
+}
+
+/* TmGlobal.xFree implementation: thin wrapper around free(3). */
+static void tmLsmFree(void *ptr){
+  free(ptr);
+}
+
+/* TmGlobal.xRealloc implementation: thin wrapper around realloc(3). */
+static void *tmLsmRealloc(void *ptr, int n){
+  return realloc(ptr, n);
+}
+
+/*
+** lsm_env.xMalloc implementation: allocate n bytes through the TmGlobal
+** stored in p->pMemCtx.
+*/
+static void *tmLsmEnvMalloc(lsm_env *p, size_t n){
+  TmGlobal *pTm = (TmGlobal *)(p->pMemCtx);
+  return tmMalloc(pTm, n);
+}
+
+/*
+** lsm_env.xFree implementation: free ptr through the TmGlobal stored in
+** p->pMemCtx.
+*/
+static void tmLsmEnvFree(lsm_env *p, void *ptr){
+  TmGlobal *pTm = (TmGlobal *)(p->pMemCtx);
+  tmFree(pTm, ptr);
+}
+
+/*
+** lsm_env.xRealloc implementation: resize ptr to n bytes through the
+** TmGlobal stored in p->pMemCtx.
+*/
+static void *tmLsmEnvRealloc(lsm_env *p, void *ptr, size_t n){
+  TmGlobal *pTm = (TmGlobal *)(p->pMemCtx);
+  return tmRealloc(pTm, ptr, n);
+}
+
+/*
+** Install the memory-checking allocator into environment pEnv.
+**
+** A new TmGlobal is allocated and configured to use malloc/realloc/free
+** as its underlying allocator and the environment's LSM_MUTEX_HEAP static
+** mutex for locking. The environment's current xMalloc/xRealloc/xFree
+** callbacks are saved in the TmGlobal so that testMallocUninstall() can
+** restore them, and are then replaced by the tmLsmEnvXXX() wrappers.
+**
+** It is an error (checked by assert()) to call this function on an
+** environment whose pMemCtx is already set.
+*/
+void testMallocInstall(lsm_env *pEnv){
+  TmGlobal *pTm;                  /* New allocator state */
+  LsmMutex *pHeapMutex;           /* Mutex wrapper for pTm */
+
+  assert( pEnv->pMemCtx==0 );
+
+  /* Allocate and populate a TmGlobal structure. */
+  pTm = (TmGlobal *)tmLsmMalloc(sizeof(TmGlobal));
+  memset(pTm, 0, sizeof(TmGlobal));
+  pTm->xMalloc = tmLsmMalloc;
+  pTm->xRealloc = tmLsmRealloc;
+  pTm->xFree = tmLsmFree;
+
+  /* Wrap the environment's static heap mutex. */
+  pHeapMutex = (LsmMutex *)pTm->xMalloc(sizeof(LsmMutex));
+  pHeapMutex->pEnv = pEnv;
+  pEnv->xMutexStatic(pEnv, LSM_MUTEX_HEAP, &pHeapMutex->pMutex);
+  pTm->pMutex = (void *)pHeapMutex;
+  pTm->xEnterMutex = tmLsmMutexEnter;
+  pTm->xLeaveMutex = tmLsmMutexLeave;
+  pTm->xDelMutex = tmLsmMutexDel;
+
+  /* Remember the callbacks that are about to be replaced. */
+  pTm->xSaveMalloc = pEnv->xMalloc;
+  pTm->xSaveRealloc = pEnv->xRealloc;
+  pTm->xSaveFree = pEnv->xFree;
+
+  /* Set up pEnv to use the new TmGlobal */
+  pEnv->pMemCtx = (void *)pTm;
+  pEnv->xMalloc = tmLsmEnvMalloc;
+  pEnv->xRealloc = tmLsmEnvRealloc;
+  pEnv->xFree = tmLsmEnvFree;
+}
+
+/*
+** Undo the effect of testMallocInstall() on pEnv. pEnv->pMemCtx is always
+** cleared. If an allocator was installed, the saved xMalloc/xRealloc/xFree
+** callbacks are restored and the TmGlobal and its mutex wrapper are freed.
+*/
+void testMallocUninstall(lsm_env *pEnv){
+  TmGlobal *pTm = (TmGlobal *)pEnv->pMemCtx;
+
+  pEnv->pMemCtx = 0;
+  if( pTm==0 ) return;
+
+  pEnv->xMalloc = pTm->xSaveMalloc;
+  pEnv->xRealloc = pTm->xSaveRealloc;
+  pEnv->xFree = pTm->xSaveFree;
+  pTm->xDelMutex(pTm);
+  tmLsmFree(pTm);
+}
+
+/*
+** Report the allocations made through environment pEnv that are still
+** outstanding.
+**
+** If the memory-checking allocator is installed, the work is delegated to
+** tmMallocCheck(). If it is not installed (pEnv->pMemCtx is NULL), zero
+** leaks are reported.
+**
+** Either of pnLeakAlloc and pnLeakByte may be NULL, in which case the
+** corresponding value is not returned. This holds whether or not the
+** allocator is installed.
+*/
+void testMallocCheck(
+  lsm_env *pEnv,
+  int *pnLeakAlloc,               /* OUT: Number of leaked allocations */
+  int *pnLeakByte,                /* OUT: Number of leaked bytes */
+  FILE *pFile                     /* Passed through to tmMallocCheck() */
+){
+  TmGlobal *pTm = (TmGlobal *)(pEnv->pMemCtx);
+
+  if( pTm==0 ){
+    if( pnLeakAlloc ) *pnLeakAlloc = 0;
+    if( pnLeakByte ) *pnLeakByte = 0;
+  }else{
+    tmMallocCheck(pTm, pnLeakAlloc, pnLeakByte, pFile);
+  }
+}
+
+/*
+** Configure and enable OOM injection on the allocator installed in pEnv.
+** The arguments are passed straight through to tmMallocOom() along with
+** pEnv->pMemCtx.
+*/
+void testMallocOom(
+  lsm_env *pEnv,
+  int nCountdown,
+  int bPersist,
+  void (*xHook)(void *),
+  void *pHookCtx
+){
+  tmMallocOom(
+      (TmGlobal *)(pEnv->pMemCtx), nCountdown, bPersist, xHook, pHookCtx
+  );
+}
+
+/*
+** Enable (bEnable!=0) or disable (bEnable==0) OOM injection on the
+** allocator installed in pEnv.
+*/
+void testMallocOomEnable(lsm_env *pEnv, int bEnable){
+  tmMallocOomEnable((TmGlobal *)(pEnv->pMemCtx), bEnable);
+}
diff --git a/ext/lsm1/lsm-test/lsmtest_tdb.c b/ext/lsm1/lsm-test/lsmtest_tdb.c
new file mode 100644
index 0000000..8377bc2
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_tdb.c
@@ -0,0 +1,845 @@
+
+/*
+** This program attempts to test the correctness of some facets of the
+** LSM database library. Specifically, that the contents of the database
+** are maintained correctly during a series of inserts and deletes.
+*/
+
+
+#include "lsmtest_tdb.h"
+#include "lsm.h"
+
+#include "lsmtest.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifndef _WIN32
+# include <unistd.h>
+#endif
+#include <stdio.h>
+
+
+typedef struct SqlDb SqlDb;
+
+/*
+** Transaction method used by database wrappers that do not implement
+** transactions. Both arguments are ignored and -1 is always returned.
+** tdb_transaction_support() recognises a handle without transaction
+** support by finding this function in the xBegin slot.
+*/
+static int error_transaction_function(TestDb *p, int iLevel){
+ unused_parameter(p);
+ unused_parameter(iLevel);
+ return -1;
+}
+
+
+/*************************************************************************
+** Begin wrapper for LevelDB.
+*/
+#ifdef HAVE_LEVELDB
+
+#include
+
+typedef struct LevelDb LevelDb;
+/*
+** TestDb wrapper around a LevelDB database opened through the C API.
+*/
+struct LevelDb {
+ TestDb base; /* Base class; must be first so casts to TestDb* work */
+ leveldb_t *db; /* LevelDB database handle */
+ leveldb_options_t *pOpt; /* Options passed to leveldb_open() */
+ leveldb_writeoptions_t *pWriteOpt; /* Options used for put/delete */
+ leveldb_readoptions_t *pReadOpt; /* Options used for get/iterators */
+
+ char *pVal; /* Value from the latest fetch; freed by next fetch or close */
+};
+
+/*
+** Close a LevelDB test handle and release everything it owns: the
+** database, the three option objects, the cached fetch buffer and the
+** handle itself. Always returns 0.
+**
+** This function is also used to clean up after a failed open, in which
+** case pDb->db is NULL. leveldb_close() dereferences its argument, so the
+** database is only closed if it was actually opened.
+*/
+static int test_leveldb_close(TestDb *pTestDb){
+  LevelDb *pDb = (LevelDb *)pTestDb;
+
+  if( pDb->db ) leveldb_close(pDb->db);
+  if( pDb->pWriteOpt ) leveldb_writeoptions_destroy(pDb->pWriteOpt);
+  if( pDb->pReadOpt ) leveldb_readoptions_destroy(pDb->pReadOpt);
+  if( pDb->pOpt ) leveldb_options_destroy(pDb->pOpt);
+  free(pDb->pVal);
+  free(pDb);
+
+  return 0;
+}
+
+/*
+** Write key (pKey/nKey) with value (pVal/nVal) into the LevelDB database.
+** Returns 0 on success or 1 if LevelDB reported an error.
+*/
+static int test_leveldb_write(
+  TestDb *pTestDb,
+  void *pKey,
+  int nKey,
+  void *pVal,
+  int nVal
+){
+  LevelDb *pDb = (LevelDb *)pTestDb;
+  char *zErr = 0;
+  int rc;
+
+  leveldb_put(pDb->db, pDb->pWriteOpt, pKey, nKey, pVal, nVal, &zErr);
+  rc = (zErr!=0);
+
+  /* The error message is heap allocated by LevelDB; do not leak it. */
+  free(zErr);
+  return rc;
+}
+
+/*
+** Delete key (pKey/nKey) from the LevelDB database. Returns 0 on success
+** or 1 if LevelDB reported an error.
+*/
+static int test_leveldb_delete(TestDb *pTestDb, void *pKey, int nKey){
+  LevelDb *pDb = (LevelDb *)pTestDb;
+  char *zErr = 0;
+  int rc;
+
+  leveldb_delete(pDb->db, pDb->pWriteOpt, pKey, nKey, &zErr);
+  rc = (zErr!=0);
+
+  /* The error message is heap allocated by LevelDB; do not leak it. */
+  free(zErr);
+  return rc;
+}
+
+/*
+** Look up key (pKey/nKey). On return *ppVal points to the value and
+** *pnVal holds its size, or *ppVal is NULL and *pnVal is -1 if
+** leveldb_get() returned no value. The buffer is owned by the handle
+** and remains valid until the next fetch or until the handle is closed.
+**
+** A NULL pKey is a no-op that returns 0 without touching the outputs.
+** Returns 0 on success or 1 if LevelDB reported an error.
+*/
+static int test_leveldb_fetch(
+  TestDb *pTestDb,
+  void *pKey,
+  int nKey,
+  void **ppVal,
+  int *pnVal
+){
+  LevelDb *pDb = (LevelDb *)pTestDb;
+  char *zErr = 0;
+  size_t nVal = 0;
+  int rc;
+
+  if( pKey==0 ) return 0;
+
+  /* Discard the result of the previous fetch before issuing a new one. */
+  free(pDb->pVal);
+  pDb->pVal = leveldb_get(pDb->db, pDb->pReadOpt, pKey, nKey, &nVal, &zErr);
+  *ppVal = (void *)(pDb->pVal);
+  if( pDb->pVal==0 ){
+    *pnVal = -1;
+  }else{
+    *pnVal = (int)nVal;
+  }
+
+  rc = (zErr!=0);
+
+  /* The error message is heap allocated by LevelDB; do not leak it. */
+  free(zErr);
+  return rc;
+}
+
+/*
+** Visit the entries of the LevelDB database in key order (reverse order
+** if bReverse is non-zero), invoking xCallback(pCtx, key, nKey, val, nVal)
+** for each. pKey1/nKey1 is the lower bound and pKey2/nKey2 the upper
+** bound of the range; either may be NULL for "unbounded". Keys equal to
+** a bound are visited. Keys are compared with memcmp() over the common
+** prefix, a shorter key sorting first on a tie. Always returns 0.
+*/
+static int test_leveldb_scan(
+ TestDb *pTestDb,
+ void *pCtx,
+ int bReverse,
+ void *pKey1, int nKey1, /* Start of search */
+ void *pKey2, int nKey2, /* End of search */
+ void (*xCallback)(void *, void *, int , void *, int)
+){
+ LevelDb *pDb = (LevelDb *)pTestDb;
+ leveldb_iterator_t *iter;
+
+ iter = leveldb_create_iterator(pDb->db, pDb->pReadOpt);
+
+ /* Position the iterator on the first entry to visit. */
+ if( bReverse==0 ){
+ if( pKey1 ){
+ leveldb_iter_seek(iter, pKey1, nKey1);
+ }else{
+ leveldb_iter_seek_to_first(iter);
+ }
+ }else{
+ if( pKey2 ){
+ /* Seek lands on the first key >= pKey2. If there is none, start
+ ** from the last entry. If it is strictly greater than pKey2, step
+ ** back one entry so the scan starts at the largest key <= pKey2. */
+ leveldb_iter_seek(iter, pKey2, nKey2);
+
+ if( leveldb_iter_valid(iter)==0 ){
+ leveldb_iter_seek_to_last(iter);
+ }else{
+ const char *k; size_t n;
+ int res;
+ k = leveldb_iter_key(iter, &n);
+ res = memcmp(k, pKey2, MIN(n, nKey2));
+ if( res==0 ) res = n - nKey2;
+ assert( res>=0 );
+ if( res>0 ){
+ leveldb_iter_prev(iter);
+ }
+ }
+ }else{
+ leveldb_iter_seek_to_last(iter);
+ }
+ }
+
+
+ /* Walk entries until the iterator is exhausted or a bound is passed. */
+ while( leveldb_iter_valid(iter) ){
+ const char *k; size_t n;
+ const char *v; size_t n2;
+ int res;
+
+ k = leveldb_iter_key(iter, &n);
+ /* Forward scan: stop once the key exceeds the upper bound. */
+ if( bReverse==0 && pKey2 ){
+ res = memcmp(k, pKey2, MIN(n, nKey2));
+ if( res==0 ) res = n - nKey2;
+ if( res>0 ) break;
+ }
+ /* Reverse scan: stop once the key drops below the lower bound. */
+ if( bReverse!=0 && pKey1 ){
+ res = memcmp(k, pKey1, MIN(n, nKey1));
+ if( res==0 ) res = n - nKey1;
+ if( res<0 ) break;
+ }
+
+ v = leveldb_iter_value(iter, &n2);
+
+ xCallback(pCtx, (void *)k, n, (void *)v, n2);
+
+ if( bReverse==0 ){
+ leveldb_iter_next(iter);
+ }else{
+ leveldb_iter_prev(iter);
+ }
+ }
+
+ leveldb_iter_destroy(iter);
+ return 0;
+}
+
+/*
+** Open (creating if necessary) the LevelDB database zFilename and return
+** a new TestDb handle in *ppDb. If bClear is non-zero any existing
+** database of that name is removed first with "rm -rf". zSpec is unused.
+**
+** Returns 0 on success. On failure 1 is returned, *ppDb is set to NULL
+** and all partially constructed state, including the LevelDB error
+** message, is released.
+*/
+static int test_leveldb_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  static const DatabaseMethods LeveldbMethods = {
+    test_leveldb_close,
+    test_leveldb_write,
+    test_leveldb_delete,
+    0,
+    test_leveldb_fetch,
+    test_leveldb_scan,
+    error_transaction_function,
+    error_transaction_function,
+    error_transaction_function
+  };
+
+  LevelDb *pLevelDb;
+  char *zErr = 0;
+
+  if( bClear ){
+    char *zCmd = sqlite3_mprintf("rm -rf %s\n", zFilename);
+    system(zCmd);
+    sqlite3_free(zCmd);
+  }
+
+  pLevelDb = (LevelDb *)malloc(sizeof(LevelDb));
+  if( pLevelDb==0 ){
+    *ppDb = 0;
+    return 1;
+  }
+  memset(pLevelDb, 0, sizeof(LevelDb));
+
+  pLevelDb->pOpt = leveldb_options_create();
+  leveldb_options_set_create_if_missing(pLevelDb->pOpt, 1);
+  pLevelDb->pWriteOpt = leveldb_writeoptions_create();
+  pLevelDb->pReadOpt = leveldb_readoptions_create();
+
+  pLevelDb->db = leveldb_open(pLevelDb->pOpt, zFilename, &zErr);
+
+  if( zErr ){
+    /* Clean up by hand: the database handle may be NULL after a failed
+    ** open and leveldb_close() must not be called on a NULL handle. */
+    free(zErr);
+    if( pLevelDb->db ) leveldb_close(pLevelDb->db);
+    leveldb_writeoptions_destroy(pLevelDb->pWriteOpt);
+    leveldb_readoptions_destroy(pLevelDb->pReadOpt);
+    leveldb_options_destroy(pLevelDb->pOpt);
+    free(pLevelDb);
+    *ppDb = 0;
+    return 1;
+  }
+
+  *ppDb = (TestDb *)pLevelDb;
+  pLevelDb->base.pMethods = &LeveldbMethods;
+  return 0;
+}
+#endif /* HAVE_LEVELDB */
+/*
+** End wrapper for LevelDB.
+*************************************************************************/
+
+#ifdef HAVE_KYOTOCABINET
+/* xClose method for Kyoto Cabinet: forwards to test_kc_close(). */
+static int kc_close(TestDb *pTestDb){
+ return test_kc_close(pTestDb);
+}
+
+/* xWrite method for Kyoto Cabinet: forwards to test_kc_write(). */
+static int kc_write(
+ TestDb *pTestDb,
+ void *pKey,
+ int nKey,
+ void *pVal,
+ int nVal
+){
+ return test_kc_write(pTestDb, pKey, nKey, pVal, nVal);
+}
+
+/* xDelete method for Kyoto Cabinet: forwards to test_kc_delete(). */
+static int kc_delete(TestDb *pTestDb, void *pKey, int nKey){
+ return test_kc_delete(pTestDb, pKey, nKey);
+}
+
+/* xDeleteRange method for Kyoto Cabinet: forwards to test_kc_delete_range(). */
+static int kc_delete_range(
+ TestDb *pTestDb,
+ void *pKey1, int nKey1,
+ void *pKey2, int nKey2
+){
+ return test_kc_delete_range(pTestDb, pKey1, nKey1, pKey2, nKey2);
+}
+
+/*
+** xFetch method for Kyoto Cabinet. A NULL key is a no-op that returns
+** LSM_OK without touching *ppVal or *pnVal; otherwise the call is
+** forwarded to test_kc_fetch().
+*/
+static int kc_fetch(
+ TestDb *pTestDb,
+ void *pKey,
+ int nKey,
+ void **ppVal,
+ int *pnVal
+){
+ if( pKey==0 ) return LSM_OK;
+ return test_kc_fetch(pTestDb, pKey, nKey, ppVal, pnVal);
+}
+
+/* xScan method for Kyoto Cabinet: forwards to test_kc_scan(). */
+static int kc_scan(
+ TestDb *pTestDb,
+ void *pCtx,
+ int bReverse,
+ void *pFirst, int nFirst,
+ void *pLast, int nLast,
+ void (*xCallback)(void *, void *, int , void *, int)
+){
+ return test_kc_scan(
+ pTestDb, pCtx, bReverse, pFirst, nFirst, pLast, nLast, xCallback
+ );
+}
+
+/*
+** Open a Kyoto Cabinet database through test_kc_open() and attach the
+** Kyoto Cabinet method table to the resulting handle. zSpec is unused.
+** On success 0 is returned and *ppDb holds the new handle; on failure
+** *ppDb is set to NULL and the error code from test_kc_open() is returned.
+*/
+static int kc_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  /* Kyoto Cabinet has no transaction support in this wrapper. */
+  static const DatabaseMethods KcdbMethods = {
+    kc_close,
+    kc_write,
+    kc_delete,
+    kc_delete_range,
+    kc_fetch,
+    kc_scan,
+    error_transaction_function,
+    error_transaction_function,
+    error_transaction_function
+  };
+
+  TestDb *pNew = 0;
+  int rc = test_kc_open(zFilename, bClear, &pNew);
+
+  if( rc==0 ){
+    pNew->pMethods = &KcdbMethods;
+    *ppDb = pNew;
+    return 0;
+  }
+
+  *ppDb = 0;
+  return rc;
+}
+#endif /* HAVE_KYOTOCABINET */
+/*
+** End wrapper for Kyoto cabinet.
+*************************************************************************/
+
+#ifdef HAVE_MDB
+/* xClose method for LMDB: forwards to test_mdb_close(). */
+static int mdb_close(TestDb *pTestDb){
+ return test_mdb_close(pTestDb);
+}
+
+/* xWrite method for LMDB: forwards to test_mdb_write(). */
+static int mdb_write(
+ TestDb *pTestDb,
+ void *pKey,
+ int nKey,
+ void *pVal,
+ int nVal
+){
+ return test_mdb_write(pTestDb, pKey, nKey, pVal, nVal);
+}
+
+/* xDelete method for LMDB: forwards to test_mdb_delete(). */
+static int mdb_delete(TestDb *pTestDb, void *pKey, int nKey){
+ return test_mdb_delete(pTestDb, pKey, nKey);
+}
+
+/*
+** xFetch method for LMDB. A NULL key is a no-op that returns LSM_OK
+** without touching *ppVal or *pnVal; otherwise the call is forwarded to
+** test_mdb_fetch().
+*/
+static int mdb_fetch(
+ TestDb *pTestDb,
+ void *pKey,
+ int nKey,
+ void **ppVal,
+ int *pnVal
+){
+ if( pKey==0 ) return LSM_OK;
+ return test_mdb_fetch(pTestDb, pKey, nKey, ppVal, pnVal);
+}
+
+/* xScan method for LMDB: forwards to test_mdb_scan(). */
+static int mdb_scan(
+ TestDb *pTestDb,
+ void *pCtx,
+ int bReverse,
+ void *pFirst, int nFirst,
+ void *pLast, int nLast,
+ void (*xCallback)(void *, void *, int , void *, int)
+){
+ return test_mdb_scan(
+ pTestDb, pCtx, bReverse, pFirst, nFirst, pLast, nLast, xCallback
+ );
+}
+
+/*
+** Open an LMDB database through test_mdb_open() and attach the LMDB
+** method table to the resulting handle. On success 0 is returned and
+** *ppDb holds the new handle; on failure *ppDb is set to NULL and the
+** error code from test_mdb_open() is returned.
+*/
+static int mdb_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  /* No range-delete and no transaction support in this wrapper. */
+  static const DatabaseMethods MdbMethods = {
+    mdb_close,
+    mdb_write,
+    mdb_delete,
+    0,
+    mdb_fetch,
+    mdb_scan,
+    error_transaction_function,
+    error_transaction_function,
+    error_transaction_function
+  };
+
+  TestDb *pNew = 0;
+  int rc = test_mdb_open(zSpec, zFilename, bClear, &pNew);
+
+  if( rc==0 ){
+    pNew->pMethods = &MdbMethods;
+    *ppDb = pNew;
+    return 0;
+  }
+
+  *ppDb = 0;
+  return rc;
+}
+#endif /* HAVE_MDB */
+
+/*************************************************************************
+** Begin wrapper for SQLite.
+*/
+
+/*
+** nOpenTrans:
+** The number of open nested transactions, in the same sense as used
+** by the tdb_begin/commit/rollback and SQLite 4 KV interfaces. If this
+** value is 0, there are no transactions open at all. If it is 1, then
+** there is a read transaction. If it is 2 or greater, then there are
+** (nOpenTrans-1) nested write transactions open.
+*/
+struct SqlDb {
+ TestDb base; /* Base class; must be first so casts to TestDb* work */
+ sqlite3 *db; /* SQLite database connection */
+ sqlite3_stmt *pInsert; /* Statement used by sql_write() */
+ sqlite3_stmt *pDelete; /* Statement used by sql_delete() */
+ sqlite3_stmt *pDeleteRange; /* Statement used by sql_delete_range() */
+ sqlite3_stmt *pFetch; /* Statement used by sql_fetch() */
+ /* Scan statements. sql_scan() selects the entry at index
+ ** (pFirst==0) + (pLast==0)*2 + bReverse*4. */
+ sqlite3_stmt *apScan[8];
+
+ int nOpenTrans; /* See the description above this structure */
+
+ /* Used by sql_fetch() to allocate space for results */
+ int nAlloc; /* Allocated size of aAlloc[] in bytes */
+ u8 *aAlloc; /* Buffer holding the most recently fetched value */
+};
+
+/*
+** Close an SQLite test handle: finalize every prepared statement, close
+** the database connection and free the fetch buffer and the handle.
+** Always returns SQLITE_OK.
+*/
+static int sql_close(TestDb *pTestDb){
+  SqlDb *p = (SqlDb *)pTestDb;
+  int iScan;
+
+  sqlite3_finalize(p->pInsert);
+  sqlite3_finalize(p->pDelete);
+  sqlite3_finalize(p->pDeleteRange);
+  sqlite3_finalize(p->pFetch);
+  for(iScan=0; iScan<8; iScan++){
+    sqlite3_finalize(p->apScan[iScan]);
+  }
+
+  sqlite3_close(p->db);
+  free((char *)p->aAlloc);
+  free((char *)p);
+  return SQLITE_OK;
+}
+
+/*
+** Store value (pVal/nVal) under key (pKey/nKey) by running the prepared
+** insert statement. Returns the result of sqlite3_reset() on that
+** statement.
+*/
+static int sql_write(
+  TestDb *pTestDb,
+  void *pKey,
+  int nKey,
+  void *pVal,
+  int nVal
+){
+  sqlite3_stmt *pStmt = ((SqlDb *)pTestDb)->pInsert;
+
+  sqlite3_bind_blob(pStmt, 1, pKey, nKey, SQLITE_STATIC);
+  sqlite3_bind_blob(pStmt, 2, pVal, nVal, SQLITE_STATIC);
+  sqlite3_step(pStmt);
+  return sqlite3_reset(pStmt);
+}
+
+/*
+** Delete key (pKey/nKey) by running the prepared delete statement.
+** Returns the result of sqlite3_reset() on that statement.
+*/
+static int sql_delete(TestDb *pTestDb, void *pKey, int nKey){
+  sqlite3_stmt *pStmt = ((SqlDb *)pTestDb)->pDelete;
+
+  sqlite3_bind_blob(pStmt, 1, pKey, nKey, SQLITE_STATIC);
+  sqlite3_step(pStmt);
+  return sqlite3_reset(pStmt);
+}
+
+/*
+** Run the prepared range-delete statement with (pKey1/nKey1) bound to
+** parameter 1 and (pKey2/nKey2) bound to parameter 2. Returns the result
+** of sqlite3_reset() on that statement.
+*/
+static int sql_delete_range(
+  TestDb *pTestDb,
+  void *pKey1, int nKey1,
+  void *pKey2, int nKey2
+){
+  sqlite3_stmt *pStmt = ((SqlDb *)pTestDb)->pDeleteRange;
+
+  sqlite3_bind_blob(pStmt, 1, pKey1, nKey1, SQLITE_STATIC);
+  sqlite3_bind_blob(pStmt, 2, pKey2, nKey2, SQLITE_STATIC);
+  sqlite3_step(pStmt);
+  return sqlite3_reset(pStmt);
+}
+
+/*
+** Look up key (pKey/nKey). If a row is found its value is copied into a
+** buffer owned by the handle; *ppVal is set to that buffer and *pnVal to
+** the value size. The buffer stays valid until the next fetch or close.
+** If no row is found *ppVal is set to NULL and *pnVal to -1.
+**
+** A NULL pKey only resets the fetch statement and returns LSM_OK; in
+** that case ppVal and pnVal must both be NULL.
+**
+** Returns the result of sqlite3_reset() on the fetch statement, or
+** SQLITE_NOMEM if the result buffer could not be enlarged.
+*/
+static int sql_fetch(
+  TestDb *pTestDb,
+  void *pKey,
+  int nKey,
+  void **ppVal,
+  int *pnVal
+){
+  SqlDb *pDb = (SqlDb *)pTestDb;
+  int rc;
+
+  sqlite3_reset(pDb->pFetch);
+  if( pKey==0 ){
+    assert( ppVal==0 );
+    assert( pnVal==0 );
+    return LSM_OK;
+  }
+
+  sqlite3_bind_blob(pDb->pFetch, 1, pKey, nKey, SQLITE_STATIC);
+  rc = sqlite3_step(pDb->pFetch);
+  if( rc==SQLITE_ROW ){
+    int nVal = sqlite3_column_bytes(pDb->pFetch, 0);
+    u8 *aVal = (u8 *)sqlite3_column_blob(pDb->pFetch, 0);
+
+    /* Grow the result buffer to twice the required size if it is too
+    ** small. The old buffer is kept until the new one is known good. */
+    if( nVal>pDb->nAlloc ){
+      u8 *aNew = (u8 *)malloc(nVal*2);
+      if( aNew==0 ){
+        sqlite3_reset(pDb->pFetch);
+        *pnVal = -1;
+        *ppVal = 0;
+        return SQLITE_NOMEM;
+      }
+      free(pDb->aAlloc);
+      pDb->aAlloc = aNew;
+      pDb->nAlloc = nVal*2;
+    }
+
+    /* For a zero-length value both pointers may be NULL, and passing
+    ** NULL to memcpy() is undefined even when the length is zero. */
+    if( nVal>0 ) memcpy(pDb->aAlloc, aVal, nVal);
+    *pnVal = nVal;
+    *ppVal = (void *)pDb->aAlloc;
+  }else{
+    *pnVal = -1;
+    *ppVal = 0;
+  }
+
+  rc = sqlite3_reset(pDb->pFetch);
+  return rc;
+}
+
+/*
+** Visit the rows of table t1 between keys (pFirst/nFirst) and
+** (pLast/nLast), in descending key order if bReverse is 1. Either bound
+** may be NULL, in which case a statement without that bound is used.
+** xCallback(pCtx, key, nKey, val, nVal) is invoked for every row.
+** Returns the result of sqlite3_reset() on the scan statement.
+*/
+static int sql_scan(
+  TestDb *pTestDb,
+  void *pCtx,
+  int bReverse,
+  void *pFirst, int nFirst,
+  void *pLast, int nLast,
+  void (*xCallback)(void *, void *, int , void *, int)
+){
+  SqlDb *pDb = (SqlDb *)pTestDb;
+  sqlite3_stmt *pStmt;
+  int iStmt;                      /* Index into pDb->apScan[] */
+
+  assert( bReverse==1 || bReverse==0 );
+
+  /* Bit 0: no lower bound. Bit 1: no upper bound. Bit 2: reverse order. */
+  iStmt = bReverse*4;
+  if( pFirst==0 ) iStmt += 1;
+  if( pLast==0 ) iStmt += 2;
+  pStmt = pDb->apScan[iStmt];
+
+  if( pFirst ) sqlite3_bind_blob(pStmt, 1, pFirst, nFirst, SQLITE_STATIC);
+  if( pLast ) sqlite3_bind_blob(pStmt, 2, pLast, nLast, SQLITE_STATIC);
+
+  while( sqlite3_step(pStmt)==SQLITE_ROW ){
+    int nKey = sqlite3_column_bytes(pStmt, 0);
+    void *pKey = (void *)sqlite3_column_blob(pStmt, 0);
+    int nVal = sqlite3_column_bytes(pStmt, 1);
+    void *pVal = (void *)sqlite3_column_blob(pStmt, 1);
+
+    xCallback(pCtx, pKey, nKey, pVal, nVal);
+  }
+  return sqlite3_reset(pStmt);
+}
+
+/*
+** Ensure that at least iLevel transaction levels are open. Level 1 is a
+** read transaction, opened with BEGIN followed by a read of
+** sqlite_master. Each level above that is a nested write transaction,
+** implemented as savepoint "x<N>" where N is the number of levels that
+** were open when it was created (so the first write transaction is x1).
+**
+** iLevel==0 is a no-op. If iLevel or more levels are already open,
+** nothing is opened and the count of open levels is left unchanged.
+** Returns 0 on success or an SQLite error code.
+*/
+static int sql_begin(TestDb *pTestDb, int iLevel){
+  int i;
+  SqlDb *pDb = (SqlDb *)pTestDb;
+
+  /* iLevel==0 is a no-op */
+  if( iLevel==0 ) return 0;
+
+  /* If there are no transactions at all open, open a read transaction. */
+  if( pDb->nOpenTrans==0 ){
+    int rc = sqlite3_exec(pDb->db,
+        "BEGIN; SELECT * FROM sqlite_master LIMIT 1;" , 0, 0, 0
+    );
+    if( rc!=0 ) return rc;
+    pDb->nOpenTrans = 1;
+  }
+
+  /* Open any required write transactions. nOpenTrans is advanced as each
+  ** savepoint is created so that it stays accurate if a later one fails. */
+  for(i=pDb->nOpenTrans; i<iLevel; i++){
+    char *zSql = sqlite3_mprintf("SAVEPOINT x%d", i);
+    int rc = sqlite3_exec(pDb->db, zSql, 0, 0, 0);
+    sqlite3_free(zSql);
+    if( rc!=SQLITE_OK ) return rc;
+    pDb->nOpenTrans = i+1;
+  }
+
+  return 0;
+}
+
+/*
+** Commit and close transactions so that at most iLevel levels remain
+** open. iLevel==0 commits everything with COMMIT. Otherwise, if more
+** than iLevel levels are open, savepoint "x<iLevel>" is released, which
+** also releases every savepoint nested inside it.
+**
+** If iLevel or fewer levels are open already this is a no-op; the count
+** of open levels is never increased. Returns 0 on success or an SQLite
+** error code.
+*/
+static int sql_commit(TestDb *pTestDb, int iLevel){
+  SqlDb *pDb = (SqlDb *)pTestDb;
+  assert( iLevel>=0 );
+
+  /* Close the read transaction if requested. */
+  if( pDb->nOpenTrans>=1 && iLevel==0 ){
+    int rc = sqlite3_exec(pDb->db, "COMMIT", 0, 0, 0);
+    if( rc!=0 ) return rc;
+    pDb->nOpenTrans = 0;
+  }
+
+  /* Close write transactions as required */
+  if( pDb->nOpenTrans>iLevel ){
+    char *zSql = sqlite3_mprintf("RELEASE x%d", iLevel);
+    int rc = sqlite3_exec(pDb->db, zSql, 0, 0, 0);
+    sqlite3_free(zSql);
+    if( rc!=0 ) return rc;
+    pDb->nOpenTrans = iLevel;
+  }
+
+  return 0;
+}
+
+/*
+** Roll back transactions down to level iLevel and record iLevel as the
+** number of open levels. Three cases are distinguished:
+**
+**   * iLevel==0 with a transaction open: ROLLBACK everything.
+**   * iLevel==1 with a write transaction open: roll back and release the
+**     outermost savepoint x1, leaving only the read transaction.
+**   * otherwise: roll back to savepoint x<iLevel-1>, which stays open.
+**
+** Returns 0 on success or the SQLite error code of the failed statement.
+**
+** NOTE(review): the final branch is also taken when nothing needs to be
+** rolled back (e.g. iLevel==0 with no transaction open issues
+** "ROLLBACK TO x-1", and iLevel==1 with only a read transaction open
+** issues "ROLLBACK TO x0"). Those statements name savepoints this file
+** never creates, so they presumably fail - confirm whether callers rely
+** on that error.
+*/
+static int sql_rollback(TestDb *pTestDb, int iLevel){
+ SqlDb *pDb = (SqlDb *)pTestDb;
+ assert( iLevel>=0 );
+
+ if( pDb->nOpenTrans>=1 && iLevel==0 ){
+ /* Close the read transaction if requested. */
+ int rc = sqlite3_exec(pDb->db, "ROLLBACK", 0, 0, 0);
+ if( rc!=0 ) return rc;
+ }else if( pDb->nOpenTrans>1 && iLevel==1 ){
+ /* Or, rollback and close the top-level write transaction */
+ int rc = sqlite3_exec(pDb->db, "ROLLBACK TO x1; RELEASE x1;", 0, 0, 0);
+ if( rc!=0 ) return rc;
+ }else{
+ /* Or, just roll back some nested transactions */
+ char *zSql = sqlite3_mprintf("ROLLBACK TO x%d", iLevel-1);
+ int rc = sqlite3_exec(pDb->db, zSql, 0, 0, 0);
+ sqlite3_free(zSql);
+ if( rc!=0 ) return rc;
+ }
+
+ pDb->nOpenTrans = iLevel;
+ return 0;
+}
+
+/*
+** Open an SQLite database as a key/value store and return a new TestDb
+** handle in *ppDb.
+**
+**   zSpec      Optional SQL executed after the default PRAGMAs (may be 0).
+**   zFilename  Database file. If bClear is set and the name is not empty,
+**              the file is unlinked first.
+**
+** The store is table t1(k PRIMARY KEY, v). All statements used by the
+** other sql_xxx() methods are prepared here. Returns 0 on success. On
+** any failure the partially constructed handle is closed, *ppDb is set
+** to NULL and an error code is returned.
+*/
+static int sql_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  static const DatabaseMethods SqlMethods = {
+    sql_close,
+    sql_write,
+    sql_delete,
+    sql_delete_range,
+    sql_fetch,
+    sql_scan,
+    sql_begin,
+    sql_commit,
+    sql_rollback
+  };
+  const char *zCreate = "CREATE TABLE IF NOT EXISTS t1(k PRIMARY KEY, v)";
+  const char *zInsert = "REPLACE INTO t1 VALUES(?, ?)";
+  const char *zDelete = "DELETE FROM t1 WHERE k = ?";
+  /* sql_delete_range() binds two parameters, so two placeholders are
+  ** required. Both bounds are exclusive. */
+  const char *zRange = "DELETE FROM t1 WHERE k>? AND k<?";
+  const char *zFetch = "SELECT v FROM t1 WHERE k = ?";
+
+  /* Scan statements, indexed as described in sql_scan():
+  ** +1 if there is no lower bound, +2 if no upper bound, +4 if reverse. */
+  const char *zScan0 = "SELECT * FROM t1 WHERE k BETWEEN ?1 AND ?2 ORDER BY k";
+  const char *zScan1 = "SELECT * FROM t1 WHERE k <= ?2 ORDER BY k";
+  const char *zScan2 = "SELECT * FROM t1 WHERE k >= ?1 ORDER BY k";
+  const char *zScan3 = "SELECT * FROM t1 ORDER BY k";
+
+  const char *zScan4 =
+    "SELECT * FROM t1 WHERE k BETWEEN ?1 AND ?2 ORDER BY k DESC";
+  const char *zScan5 = "SELECT * FROM t1 WHERE k <= ?2 ORDER BY k DESC";
+  const char *zScan6 = "SELECT * FROM t1 WHERE k >= ?1 ORDER BY k DESC";
+  const char *zScan7 = "SELECT * FROM t1 ORDER BY k DESC";
+
+  int rc;
+  SqlDb *pDb;
+  char *zPragma;
+
+  if( bClear && zFilename && zFilename[0] ){
+    unlink(zFilename);
+  }
+
+  pDb = (SqlDb *)malloc(sizeof(SqlDb));
+  if( pDb==0 ){
+    *ppDb = 0;
+    return SQLITE_NOMEM;
+  }
+  memset(pDb, 0, sizeof(SqlDb));
+  pDb->base.pMethods = &SqlMethods;
+
+  if( 0!=(rc = sqlite3_open(zFilename, &pDb->db))
+   || 0!=(rc = sqlite3_exec(pDb->db, zCreate, 0, 0, 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zInsert, -1, &pDb->pInsert, 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zDelete, -1, &pDb->pDelete, 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zRange, -1, &pDb->pDeleteRange, 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zFetch, -1, &pDb->pFetch, 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan0, -1, &pDb->apScan[0], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan1, -1, &pDb->apScan[1], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan2, -1, &pDb->apScan[2], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan3, -1, &pDb->apScan[3], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan4, -1, &pDb->apScan[4], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan5, -1, &pDb->apScan[5], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan6, -1, &pDb->apScan[6], 0))
+   || 0!=(rc = sqlite3_prepare_v2(pDb->db, zScan7, -1, &pDb->apScan[7], 0))
+  ){
+    *ppDb = 0;
+    sql_close((TestDb *)pDb);
+    return rc;
+  }
+
+  /* NOTE(review): page_size is set after the table has been created; for
+  ** a newly created file this presumably comes too late to take effect.
+  ** Left as is because changing it would alter benchmark behavior. */
+  zPragma = sqlite3_mprintf("PRAGMA page_size=%d", TESTDB_DEFAULT_PAGE_SIZE);
+  sqlite3_exec(pDb->db, zPragma, 0, 0, 0);
+  sqlite3_free(zPragma);
+  zPragma = sqlite3_mprintf("PRAGMA cache_size=%d", TESTDB_DEFAULT_CACHE_SIZE);
+  sqlite3_exec(pDb->db, zPragma, 0, 0, 0);
+  sqlite3_free(zPragma);
+
+  /* sqlite3_exec(pDb->db, "PRAGMA locking_mode=EXCLUSIVE", 0, 0, 0); */
+  sqlite3_exec(pDb->db, "PRAGMA synchronous=OFF", 0, 0, 0);
+  sqlite3_exec(pDb->db, "PRAGMA journal_mode=WAL", 0, 0, 0);
+  sqlite3_exec(pDb->db, "PRAGMA wal_autocheckpoint=4096", 0, 0, 0);
+  if( zSpec ){
+    rc = sqlite3_exec(pDb->db, zSpec, 0, 0, 0);
+    if( rc!=SQLITE_OK ){
+      /* Clear the output so the caller never sees a stale pointer. */
+      *ppDb = 0;
+      sql_close((TestDb *)pDb);
+      return rc;
+    }
+  }
+
+  *ppDb = (TestDb *)pDb;
+  return 0;
+}
+/*
+** End wrapper for SQLite.
+*************************************************************************/
+
+/*************************************************************************
+** Begin exported functions.
+*/
+/*
+** Table of the database systems available through the tdb_xxx() API.
+** Entries guarded by #ifdef are only present when the corresponding
+** library or feature was enabled at compile time.
+*/
+static struct Lib {
+ const char *zName; /* Name of the system, as returned by tdb_system_name() */
+ const char *zDefaultDb; /* Default database file name for this system */
+ /* Open method; arguments are (zSpec, zFilename, bClear, ppDb) */
+ int (*xOpen)(const char *, const char *zFilename, int bClear, TestDb **ppDb);
+} aLib[] = {
+ { "sqlite3", "testdb.sqlite", sql_open },
+ { "lsm_small", "testdb.lsm_small", test_lsm_small_open },
+ { "lsm_lomem", "testdb.lsm_lomem", test_lsm_lomem_open },
+#ifdef HAVE_ZLIB
+ { "lsm_zip", "testdb.lsm_zip", test_lsm_zip_open },
+#endif
+ { "lsm", "testdb.lsm", test_lsm_open },
+#ifdef LSM_MUTEX_PTHREADS
+ { "lsm_mt2", "testdb.lsm_mt2", test_lsm_mt2 },
+ { "lsm_mt3", "testdb.lsm_mt3", test_lsm_mt3 },
+#endif
+#ifdef HAVE_LEVELDB
+ { "leveldb", "testdb.leveldb", test_leveldb_open },
+#endif
+#ifdef HAVE_KYOTOCABINET
+ { "kyotocabinet", "testdb.kc", kc_open },
+#endif
+#ifdef HAVE_MDB
+ { "mdb", "./testdb.mdb", mdb_open }
+#endif
+};
+
+/*
+** Return the name of the i'th entry of aLib[], or NULL if i is out of
+** range. Callers can enumerate all systems by counting up from 0 until
+** NULL is returned.
+*/
+const char *tdb_system_name(int i){
+  if( i>=0 && i<ArraySize(aLib) ){
+    return aLib[i].zName;
+  }
+  return 0;
+}
+
+const char *tdb_default_db(const char *zSys){
+ int i;
+ for(i=0; izLibrary = aLib[i].zName;
+ }
+ break;
+ }
+ }
+
+ if( rc ){
+ /* Failed to find the requested database library. Return an error. */
+ *ppDb = 0;
+ }
+ return rc;
+}
+
+/*
+** Close database handle pDb through its xClose method and return that
+** method's result. Passing NULL is a harmless no-op that returns 0.
+*/
+int tdb_close(TestDb *pDb){
+  if( pDb==0 ) return 0;
+  return pDb->pMethods->xClose(pDb);
+}
+
+/* Write a key/value pair by dispatching to the handle's xWrite method. */
+int tdb_write(TestDb *pDb, void *pKey, int nKey, void *pVal, int nVal){
+ return pDb->pMethods->xWrite(pDb, pKey, nKey, pVal, nVal);
+}
+
+/* Delete a key by dispatching to the handle's xDelete method. */
+int tdb_delete(TestDb *pDb, void *pKey, int nKey){
+ return pDb->pMethods->xDelete(pDb, pKey, nKey);
+}
+
+/*
+** Delete a range of keys by dispatching to the handle's xDeleteRange
+** method and returning its result.
+**
+** Some wrappers (LevelDB, MDB) leave the xDeleteRange slot of their
+** method table set to 0. For such handles -1 is returned, the same value
+** that error_transaction_function() uses for unsupported operations,
+** instead of calling through a NULL function pointer.
+*/
+int tdb_delete_range(
+  TestDb *pDb, void *pKey1, int nKey1, void *pKey2, int nKey2
+){
+  if( pDb->pMethods->xDeleteRange==0 ) return -1;
+  return pDb->pMethods->xDeleteRange(pDb, pKey1, nKey1, pKey2, nKey2);
+}
+
+/* Look up a key by dispatching to the handle's xFetch method. */
+int tdb_fetch(TestDb *pDb, void *pKey, int nKey, void **ppVal, int *pnVal){
+ return pDb->pMethods->xFetch(pDb, pKey, nKey, ppVal, pnVal);
+}
+
+/*
+** Scan a range of keys by dispatching to the handle's xScan method,
+** which invokes xCallback for each entry visited.
+*/
+int tdb_scan(
+ TestDb *pDb, /* Database handle */
+ void *pCtx, /* Context pointer to pass to xCallback */
+ int bReverse, /* True to scan in reverse order */
+ void *pKey1, int nKey1, /* Start of search */
+ void *pKey2, int nKey2, /* End of search */
+ void (*xCallback)(void *pCtx, void *pKey, int nKey, void *pVal, int nVal)
+){
+ return pDb->pMethods->xScan(
+ pDb, pCtx, bReverse, pKey1, nKey1, pKey2, nKey2, xCallback
+ );
+}
+
+/*
+** Transaction control. Each function dispatches to the corresponding
+** method of the handle (xBegin, xCommit, xRollback). For wrappers without
+** transaction support these slots hold error_transaction_function(),
+** which returns -1.
+*/
+int tdb_begin(TestDb *pDb, int iLevel){
+ return pDb->pMethods->xBegin(pDb, iLevel);
+}
+int tdb_commit(TestDb *pDb, int iLevel){
+ return pDb->pMethods->xCommit(pDb, iLevel);
+}
+int tdb_rollback(TestDb *pDb, int iLevel){
+ return pDb->pMethods->xRollback(pDb, iLevel);
+}
+
+/*
+** Return non-zero if pDb supports transactions, i.e. if its xBegin
+** method is anything other than error_transaction_function().
+*/
+int tdb_transaction_support(TestDb *pDb){
+ return (pDb->pMethods->xBegin != error_transaction_function);
+}
+
+/* Return the library name stored in the handle's zLibrary field. */
+const char *tdb_library_name(TestDb *pDb){
+ return pDb->zLibrary;
+}
+
+/*
+** End exported functions.
+*************************************************************************/
diff --git a/ext/lsm1/lsm-test/lsmtest_tdb.h b/ext/lsm1/lsm-test/lsmtest_tdb.h
new file mode 100644
index 0000000..c55b6e2
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_tdb.h
@@ -0,0 +1,174 @@
+
+/*
+** This file is the interface to a very simple database library used for
+** testing. The interface is similar to that of the LSM. The main virtue
+** of this library is that the same API may be used to access a key-value
+** store implemented by LSM, SQLite or another database system. Which
+** makes it easy to use for correctness and performance tests.
+*/
+
+#ifndef __WRAPPER_H_
+#define __WRAPPER_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "lsm.h"
+
+typedef struct TestDb TestDb;
+
+/*
+** Open a new database connection. The first argument is the name of the
+** database library to use. e.g. something like:
+**
+** "sqlite3"
+** "lsm"
+**
+** See function tdb_system_name() for a list of available database systems.
+**
+** The second argument is the name of the database to open (e.g. a filename).
+**
+** If the third parameter is non-zero, then any existing database by the
+** name of zDb is removed before opening a new one. If it is zero, then an
+** existing database may be opened.
+*/
+int tdb_open(const char *zLibrary, const char *zDb, int bClear, TestDb **ppDb);
+
+/*
+** Close a database handle.
+*/
+int tdb_close(TestDb *pDb);
+
+/*
+** Write a new key/value into the database.
+*/
+int tdb_write(TestDb *pDb, void *pKey, int nKey, void *pVal, int nVal);
+
+/*
+** Delete a key from the database.
+*/
+int tdb_delete(TestDb *pDb, void *pKey, int nKey);
+
+/*
+** Delete a range of keys from the database.
+*/
+int tdb_delete_range(TestDb *, void *pKey1, int nKey1, void *pKey2, int nKey2);
+
+/*
+** Query the database for key (pKey/nKey). If no entry is found, set *ppVal
+** to 0 and *pnVal to -1 before returning. Otherwise, set *ppVal and *pnVal
+** to a pointer to and size of the value associated with (pKey/nKey).
+*/
+int tdb_fetch(TestDb *pDb, void *pKey, int nKey, void **ppVal, int *pnVal);
+
+/*
+** Open and close nested transactions. Currently, these functions only
+** work for SQLite3 and LSM systems. Use the tdb_transaction_support()
+** function to determine if a given TestDb handle supports these methods.
+**
+** These functions and the iLevel parameter follow the same conventions as
+** the SQLite 4 transaction interface. Note that this is slightly different
+** from the way LSM does things. As follows:
+**
+** tdb_begin():
+** A successful call to tdb_begin() with (iLevel>1) guarantees that
+** there are at least (iLevel-1) write transactions open. If iLevel==1,
+** then it guarantees that at least a read-transaction is open. Calling
+** tdb_begin() with iLevel==0 is a no-op.
+**
+** tdb_commit():
+** A successful call to tdb_commit() with (iLevel>1) guarantees that
+** there are at most (iLevel-1) write transactions open. If iLevel==1,
+** then it guarantees that there are no write transactions open (although
+** a read-transaction may remain open). Calling tdb_commit() with
+** iLevel==0 ensures that all transactions, read or write, have been
+** closed and committed.
+**
+** tdb_rollback():
+** This call is similar to tdb_commit(), except that instead of committing
+** transactions, it reverts them. For example, calling tdb_rollback() with
+** iLevel==2 ensures that there is at most one write transaction open, and
+** restores the database to the state that it was in when that transaction
+** was opened.
+**
+** In other words, tdb_commit() just closes transactions - tdb_rollback()
+** closes transactions and then restores the database to the state it
+** was in before those transactions were even opened.
+*/
+int tdb_begin(TestDb *pDb, int iLevel);
+int tdb_commit(TestDb *pDb, int iLevel);
+int tdb_rollback(TestDb *pDb, int iLevel);
+
+/*
+** Return true if transactions are supported, or false otherwise.
+*/
+int tdb_transaction_support(TestDb *pDb);
+
+/*
+** Return the name of the database library (as passed to tdb_open()) used
+** by the handle passed as the first argument.
+*/
+const char *tdb_library_name(TestDb *pDb);
+
+/*
+** Scan a range of database keys. Invoke the callback function for each
+** key visited.
+*/
+int tdb_scan(
+ TestDb *pDb, /* Database handle */
+ void *pCtx, /* Context pointer to pass to xCallback */
+ int bReverse, /* True to scan in reverse order */
+ void *pKey1, int nKey1, /* Start of search */
+ void *pKey2, int nKey2, /* End of search */
+ void (*xCallback)(void *pCtx, void *pKey, int nKey, void *pVal, int nVal)
+);
+
+const char *tdb_system_name(int i);
+const char *tdb_default_db(const char *zSys);
+
+int tdb_lsm_open(const char *zCfg, const char *zDb, int bClear, TestDb **ppDb);
+
+/*
+** If the TestDb handle passed as an argument is a wrapper around an LSM
+** database, return the LSM handle. Otherwise, if the argument is some other
+** database system, return NULL.
+*/
+lsm_db *tdb_lsm(TestDb *pDb);
+
+/*
+** Return true if the db passed as an argument is a multi-threaded LSM
+** connection.
+*/
+int tdb_lsm_multithread(TestDb *pDb);
+
+/*
+** Return a pointer to the lsm_env object used by all lsm database
+** connections initialized as a copy of the object returned by
+** lsm_default_env(). It may be modified (e.g. to override functions)
+** if the caller can guarantee that it is not already in use.
+*/
+lsm_env *tdb_lsm_env(void);
+
+/*
+** The following functions only work with LSM database handles. It is
+** illegal to call them with any other type of database handle specified
+** as an argument.
+*/
+void tdb_lsm_enable_log(TestDb *pDb, int bEnable);
+void tdb_lsm_application_crash(TestDb *pDb);
+void tdb_lsm_prepare_system_crash(TestDb *pDb);
+void tdb_lsm_system_crash(TestDb *pDb);
+void tdb_lsm_prepare_sync_crash(TestDb *pDb, int iSync);
+
+
+void tdb_lsm_safety(TestDb *pDb, int eMode);
+void tdb_lsm_config_work_hook(TestDb *pDb, void (*)(lsm_db *, void *), void *);
+void tdb_lsm_write_hook(TestDb *, void(*)(void*,int,lsm_i64,int,int), void*);
+int tdb_lsm_config_str(TestDb *pDb, const char *zStr);
+
+#ifdef __cplusplus
+} /* End of the 'extern "C"' block */
+#endif
+
+#endif
diff --git a/ext/lsm1/lsm-test/lsmtest_tdb2.cc b/ext/lsm1/lsm-test/lsmtest_tdb2.cc
new file mode 100644
index 0000000..307c2b5
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_tdb2.cc
@@ -0,0 +1,370 @@
+
+
+#include "lsmtest.h"
+#include
+
+#ifdef HAVE_KYOTOCABINET
+#include "kcpolydb.h"
+/* TestDb wrapper around a Kyoto Cabinet TreeDB. */
+extern "C" {
+ struct KcDb {
+ TestDb base; /* Base class; must be first so casts to TestDb* work */
+ kyotocabinet::TreeDB* db; /* Kyoto Cabinet database object */
+ char *pVal; /* Value from the latest fetch; released with delete[] */
+ };
+}
+
+/*
+** Open (creating if necessary) the Kyoto Cabinet tree database zFilename
+** and return a new handle in *ppDb. If bClear is non-zero any existing
+** database of that name is removed first with "rm -rf".
+**
+** Returns 0 on success. On failure 1 is returned, *ppDb is set to NULL
+** and both the handle and the TreeDB object are released.
+*/
+int test_kc_open(const char *zFilename, int bClear, TestDb **ppDb){
+  KcDb *pKcDb;
+  int ok;
+  int rc = 0;
+
+  if( bClear ){
+    char *zCmd = sqlite3_mprintf("rm -rf %s\n", zFilename);
+    system(zCmd);
+    sqlite3_free(zCmd);
+  }
+
+  pKcDb = (KcDb *)malloc(sizeof(KcDb));
+  if( pKcDb==0 ){
+    *ppDb = 0;
+    return 1;
+  }
+  memset(pKcDb, 0, sizeof(KcDb));
+
+  pKcDb->db = new kyotocabinet::TreeDB();
+  pKcDb->db->tune_page(TESTDB_DEFAULT_PAGE_SIZE);
+  pKcDb->db->tune_page_cache(
+      TESTDB_DEFAULT_PAGE_SIZE * TESTDB_DEFAULT_CACHE_SIZE
+  );
+  ok = pKcDb->db->open(zFilename,
+      kyotocabinet::PolyDB::OWRITER | kyotocabinet::PolyDB::OCREATE
+  );
+  if( ok==0 ){
+    /* The TreeDB was allocated with new and is not owned by anything
+    ** else; delete it before freeing the handle that points to it. */
+    delete pKcDb->db;
+    free(pKcDb);
+    pKcDb = 0;
+    rc = 1;
+  }
+
+  *ppDb = (TestDb *)pKcDb;
+  return rc;
+}
+
+/*
+** Close a Kyoto Cabinet handle: release the cached fetch buffer, close
+** and delete the TreeDB object and free the handle. Always returns 0.
+*/
+int test_kc_close(TestDb *pDb){
+  KcDb *p = (KcDb *)pDb;
+
+  /* delete[] on a NULL pointer is a no-op, so no guard is required. */
+  delete [] p->pVal;
+
+  p->db->close();
+  delete p->db;
+  free(p);
+  return 0;
+}
+
+/*
+** Store value (pVal/nVal) under key (pKey/nKey). Returns 0 if the
+** TreeDB set() call reports success, 1 otherwise.
+*/
+int test_kc_write(TestDb *pDb, void *pKey, int nKey, void *pVal, int nVal){
+  KcDb *p = (KcDb *)pDb;
+
+  if( p->db->set((const char *)pKey, nKey, (const char *)pVal, nVal) ){
+    return 0;
+  }
+  return 1;
+}
+
+/*
+** Remove key (pKey/nKey). Returns 0 if the TreeDB remove() call reports
+** success, 1 otherwise.
+*/
+int test_kc_delete(TestDb *pDb, void *pKey, int nKey){
+  KcDb *p = (KcDb *)pDb;
+
+  if( p->db->remove((const char *)pKey, nKey) ){
+    return 0;
+  }
+  return 1;
+}
+
+int test_kc_delete_range(
+ TestDb *pDb,
+ void *pKey1, int nKey1,
+ void *pKey2, int nKey2
+){
+ int res;
+ KcDb *pKcDb = (KcDb *)pDb;
+ kyotocabinet::DB::Cursor* pCur = pKcDb->db->cursor();
+
+ if( pKey1 ){
+ res = pCur->jump((const char *)pKey1, nKey1);
+ }else{
+ res = pCur->jump();
+ }
+
+ while( 1 ){
+ const char *pKey; size_t nKey;
+ const char *pVal; size_t nVal;
+
+ pKey = pCur->get(&nKey, &pVal, &nVal);
+ if( pKey==0 ) break;
+
+#ifndef NDEBUG
+ if( pKey1 ){
+ res = memcmp(pKey, pKey1, MIN((size_t)nKey1, nKey));
+ assert( res>0 || (res==0 && nKey>nKey1) );
+ }
+#endif
+
+ if( pKey2 ){
+ res = memcmp(pKey, pKey2, MIN((size_t)nKey2, nKey));
+ if( res>0 || (res==0 && (size_t)nKey2remove();
+ delete [] pKey;
+ }
+
+ delete pCur;
+ return 0;
+}
+
+/*
+** Look up key (pKey/nKey). The buffer returned by the previous fetch is
+** released first. If the TreeDB get() call returns a value, *ppVal is set
+** to it and *pnVal to its size; the buffer is kept in the handle until
+** the next fetch or close. Otherwise *ppVal is set to NULL and *pnVal to
+** -1. Always returns 0.
+*/
+int test_kc_fetch(
+  TestDb *pDb,
+  void *pKey,
+  int nKey,
+  void **ppVal,
+  int *pnVal
+){
+  KcDb *p = (KcDb *)pDb;
+  size_t nByte;
+
+  /* Drop the previous result; delete[] accepts a NULL pointer. */
+  delete [] p->pVal;
+  p->pVal = 0;
+
+  p->pVal = p->db->get((const char *)pKey, nKey, &nByte);
+  *ppVal = p->pVal;
+  *pnVal = p->pVal ? (int)nByte : -1;
+
+  return 0;
+}
+
+/*
+** Visit the entries of the Kyoto Cabinet database in key order (reverse
+** order if bReverse is non-zero), invoking xCallback for each one.
+** pKey1/nKey1 is the lower bound and pKey2/nKey2 the upper bound of the
+** range; either may be NULL for "unbounded". Keys equal to a bound are
+** visited. Keys are compared with memcmp() over the common prefix, a
+** shorter key sorting first on a tie. Always returns 0.
+**
+** A forward scan starts at pKey1 and stops at the first key greater than
+** pKey2. A reverse scan starts at pKey2 and stops at the first key
+** smaller than pKey1.
+*/
+int test_kc_scan(
+  TestDb *pDb,                    /* Database handle */
+  void *pCtx,                     /* Context pointer to pass to xCallback */
+  int bReverse,                   /* True for a reverse order scan */
+  void *pKey1, int nKey1,         /* Start of search */
+  void *pKey2, int nKey2,         /* End of search */
+  void (*xCallback)(void *pCtx, void *pKey, int nKey, void *pVal, int nVal)
+){
+  KcDb *pKcDb = (KcDb *)pDb;
+  kyotocabinet::DB::Cursor* pCur = pKcDb->db->cursor();
+  int res;
+
+  /* Position the cursor on the first entry to visit. */
+  if( bReverse==0 ){
+    if( pKey1 ){
+      res = pCur->jump((const char *)pKey1, nKey1);
+    }else{
+      res = pCur->jump();
+    }
+  }else{
+    if( pKey2 ){
+      res = pCur->jump_back((const char *)pKey2, nKey2);
+    }else{
+      res = pCur->jump_back();
+    }
+  }
+
+  while( res ){
+    const char *pKey; size_t nKey;
+    const char *pVal; size_t nVal;
+    pKey = pCur->get(&nKey, &pVal, &nVal);
+    if( pKey==0 ) break;
+
+    if( bReverse==0 && pKey2 ){
+      /* Forward scan: stop once the key exceeds the upper bound. */
+      res = memcmp(pKey, pKey2, MIN((size_t)nKey2, nKey));
+      if( res>0 || (res==0 && (size_t)nKey2<nKey) ){
+        delete [] pKey;
+        break;
+      }
+    }else if( bReverse!=0 && pKey1 ){
+      /* Reverse scan: stop once the key drops below the lower bound. */
+      res = memcmp(pKey, pKey1, MIN((size_t)nKey1, nKey));
+      if( res<0 || (res==0 && (size_t)nKey1>nKey) ){
+        delete [] pKey;
+        break;
+      }
+    }
+
+    /* pKey was allocated by the cursor and is released here after use. */
+    xCallback(pCtx, (void *)pKey, (int)nKey, (void *)pVal, (int)nVal);
+    delete [] pKey;
+
+    if( bReverse ){
+      res = pCur->step_back();
+    }else{
+      res = pCur->step();
+    }
+  }
+
+  delete pCur;
+  return 0;
+}
+#endif /* HAVE_KYOTOCABINET */
+
+#ifdef HAVE_MDB
+#include "lmdb.h"
+
+/* TestDb wrapper around an LMDB environment and its default database. */
+extern "C" {
+ struct MdbDb {
+ TestDb base; /* Base class; must be first so casts to TestDb* work */
+ MDB_env *env; /* LMDB environment handle */
+ MDB_dbi dbi; /* Database handle opened in test_mdb_open() */
+ };
+}
+
+/*
+** Open the LMDB database file zFilename (a single file, not a directory)
+** and return a new handle in *ppDb. If bClear is non-zero any existing
+** file of that name is removed first with "rm -rf". zSpec is unused.
+** The environment is opened with a 1GiB map size and without syncing.
+**
+** Returns 0 on success. On failure a non-zero error code is returned,
+** *ppDb is set to NULL and the environment and handle are released.
+*/
+int test_mdb_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  MDB_txn *txn;
+  MdbDb *pMdb;
+  int rc;
+
+  if( bClear ){
+    char *zCmd = sqlite3_mprintf("rm -rf %s\n", zFilename);
+    system(zCmd);
+    sqlite3_free(zCmd);
+  }
+
+  pMdb = (MdbDb *)malloc(sizeof(MdbDb));
+  if( pMdb==0 ){
+    *ppDb = 0;
+    return 1;
+  }
+  memset(pMdb, 0, sizeof(MdbDb));
+
+  rc = mdb_env_create(&pMdb->env);
+  if( rc==0 ) rc = mdb_env_set_mapsize(pMdb->env, 1*1024*1024*1024);
+  if( rc==0 ) rc = mdb_env_open(pMdb->env, zFilename, MDB_NOSYNC|MDB_NOSUBDIR, 0600);
+  if( rc==0 ) rc = mdb_txn_begin(pMdb->env, NULL, 0, &txn);
+  if( rc==0 ){
+    rc = mdb_open(txn, NULL, 0, &pMdb->dbi);
+    if( rc==0 ){
+      rc = mdb_txn_commit(txn);
+    }else{
+      /* Do not commit a transaction in which opening the dbi failed. */
+      mdb_txn_abort(txn);
+    }
+  }
+
+  if( rc!=0 ){
+    /* Release the environment (if it was created) and the handle so
+    ** that nothing is leaked and the caller never sees a dead handle. */
+    if( pMdb->env ) mdb_env_close(pMdb->env);
+    free(pMdb);
+    pMdb = 0;
+  }
+
+  *ppDb = (TestDb *)pMdb;
+  return rc;
+}
+
+/*
+** Close an LMDB handle: close the database handle and the environment,
+** then free the wrapper. Always returns 0.
+*/
+int test_mdb_close(TestDb *pDb){
+ MdbDb *pMdb = (MdbDb *)pDb;
+
+ mdb_close(pMdb->env, pMdb->dbi);
+ mdb_env_close(pMdb->env);
+ free(pMdb);
+ return 0;
+}
+
+/*
+** Store value (pVal/nVal) under key (pKey/nKey) inside its own write
+** transaction. The transaction is committed if mdb_put() succeeds and
+** aborted otherwise. Returns 0 on success or an LMDB error code.
+*/
+int test_mdb_write(TestDb *pDb, void *pKey, int nKey, void *pVal, int nVal){
+  MdbDb *pMdb = (MdbDb *)pDb;
+  MDB_txn *pTxn;
+  MDB_val k;
+  MDB_val v;
+  int rc;
+
+  v.mv_size = nVal;
+  v.mv_data = pVal;
+  k.mv_size = nKey;
+  k.mv_data = pKey;
+
+  rc = mdb_txn_begin(pMdb->env, NULL, 0, &pTxn);
+  if( rc!=0 ) return rc;
+
+  rc = mdb_put(pTxn, pMdb->dbi, &k, &v, 0);
+  if( rc!=0 ){
+    mdb_txn_abort(pTxn);
+    return rc;
+  }
+  return mdb_txn_commit(pTxn);
+}
+
+/*
+** Delete key (pKey/nKey) inside its own write transaction. The
+** transaction is committed if mdb_del() succeeds and aborted otherwise.
+** Returns 0 on success or an LMDB error code.
+*/
+int test_mdb_delete(TestDb *pDb, void *pKey, int nKey){
+  MdbDb *pMdb = (MdbDb *)pDb;
+  MDB_txn *pTxn;
+  MDB_val k;
+  int rc;
+
+  k.mv_size = nKey;
+  k.mv_data = pKey;
+
+  rc = mdb_txn_begin(pMdb->env, NULL, 0, &pTxn);
+  if( rc!=0 ) return rc;
+
+  rc = mdb_del(pTxn, pMdb->dbi, &k, 0);
+  if( rc!=0 ){
+    mdb_txn_abort(pTxn);
+    return rc;
+  }
+  return mdb_txn_commit(pTxn);
+}
+
+/*
+** Look up key (pKey/nKey) inside a read-only transaction. If the key is
+** not found, *ppVal is set to NULL, *pnVal to -1 and 0 is returned.
+** Otherwise *ppVal and *pnVal are set from the MDB_val filled in by
+** mdb_get() and its return code is returned. If the transaction cannot
+** be started the outputs are left untouched.
+**
+** NOTE(review): *ppVal points at memory returned by mdb_get(), and the
+** transaction is ended before this function returns. LMDB documents such
+** pointers as valid only until the end of the transaction or the next
+** update - confirm that callers copy the value before relying on it.
+*/
+int test_mdb_fetch(
+ TestDb *pDb,
+ void *pKey,
+ int nKey,
+ void **ppVal,
+ int *pnVal
+){
+ int rc;
+ MdbDb *pMdb = (MdbDb *)pDb;
+ MDB_val key;
+ MDB_txn *txn;
+
+ key.mv_size = nKey;
+ key.mv_data = pKey;
+
+ rc = mdb_txn_begin(pMdb->env, NULL, MDB_RDONLY, &txn);
+ if( rc==0 ){
+ MDB_val val = {0, 0};
+ rc = mdb_get(txn, pMdb->dbi, &key, &val);
+ if( rc==MDB_NOTFOUND ){
+ /* A missing key is not an error for the tdb interface. */
+ rc = 0;
+ *ppVal = 0;
+ *pnVal = -1;
+ }else{
+ *ppVal = val.mv_data;
+ *pnVal = val.mv_size;
+ }
+ mdb_txn_commit(txn);
+ }
+
+ return rc;
+}
+
+/*
+** Visit every entry of the LMDB database in key order (reverse order if
+** bReverse is non-zero), invoking xCallback for each one. The scan runs
+** inside a read-only transaction that is ended before returning, so the
+** reader slot is available to later transactions. Returns 0 on success
+** or the LMDB error code from starting the transaction or opening the
+** cursor.
+**
+** NOTE(review): the range arguments (pKey1/nKey1, pKey2/nKey2) are not
+** used; the callback sees all keys regardless of the requested range.
+** Confirm whether callers of the MDB backend depend on range filtering.
+*/
+int test_mdb_scan(
+  TestDb *pDb,                    /* Database handle */
+  void *pCtx,                     /* Context pointer to pass to xCallback */
+  int bReverse,                   /* True for a reverse order scan */
+  void *pKey1, int nKey1,         /* Start of search */
+  void *pKey2, int nKey2,         /* End of search */
+  void (*xCallback)(void *pCtx, void *pKey, int nKey, void *pVal, int nVal)
+){
+  MdbDb *pMdb = (MdbDb *)pDb;
+  int rc;
+  MDB_cursor_op op = bReverse ? MDB_PREV : MDB_NEXT;
+  MDB_txn *txn;
+
+  rc = mdb_txn_begin(pMdb->env, NULL, MDB_RDONLY, &txn);
+  if( rc==0 ){
+    MDB_cursor *csr;
+    MDB_val key = {0, 0};
+    MDB_val val = {0, 0};
+
+    rc = mdb_cursor_open(txn, pMdb->dbi, &csr);
+    if( rc==0 ){
+      while( mdb_cursor_get(csr, &key, &val, op)==0 ){
+        xCallback(pCtx, key.mv_data, key.mv_size, val.mv_data, val.mv_size);
+      }
+      mdb_cursor_close(csr);
+    }
+
+    /* End the read transaction. Nothing was written, so abort is enough,
+    ** and it must happen even if the cursor could not be opened. */
+    mdb_txn_abort(txn);
+  }
+
+  return rc;
+}
+
+#endif /* HAVE_MDB */
+
diff --git a/ext/lsm1/lsm-test/lsmtest_tdb3.c b/ext/lsm1/lsm-test/lsmtest_tdb3.c
new file mode 100644
index 0000000..1862023
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_tdb3.c
@@ -0,0 +1,1389 @@
+
+#include "lsmtest_tdb.h"
+#include "lsm.h"
+#include "lsmtest.h"
+
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+#ifndef _WIN32
+# include <unistd.h>
+#endif
+#include <stdio.h>
+
+#ifndef _WIN32
+# include <sys/time.h>
+#endif
+
+typedef struct LsmDb LsmDb;
+typedef struct LsmWorker LsmWorker;
+typedef struct LsmFile LsmFile;
+
+#define LSMTEST_DFLT_MT_MAX_CKPT (8*1024)
+#define LSMTEST_DFLT_MT_MIN_CKPT (2*1024)
+
+#ifdef LSM_MUTEX_PTHREADS
+#include <pthread.h>
+
+#define LSMTEST_THREAD_CKPT      1
+#define LSMTEST_THREAD_WORKER    2
+#define LSMTEST_THREAD_WORKER_AC 3
+
+/*
+** There are several different types of worker threads that run in different
+** test configurations, depending on the value of LsmWorker.eType.
+**
+**   LSMTEST_THREAD_CKPT (1):       Checkpointer.
+**   LSMTEST_THREAD_WORKER (2):     Worker without auto-checkpoint
+**                                  (mt_start_worker() applies
+**                                  "autocheckpoint=0" to its connection).
+**   LSMTEST_THREAD_WORKER_AC (3):  Worker with auto-checkpoint left as
+**                                  configured.
+*/
+struct LsmWorker {
+  LsmDb *pDb;                     /* Main database structure */
+  lsm_db *pWorker;                /* Worker database handle */
+  pthread_t worker_thread;        /* Worker thread */
+  pthread_cond_t worker_cond;     /* Condition var the worker waits on */
+  pthread_mutex_t worker_mutex;   /* Mutex used with worker_cond */
+  int bDoWork;                    /* Set to true by client when there is work */
+  int worker_rc;                  /* Store error code here */
+  int eType;                      /* LSMTEST_THREAD_XXX constant */
+  int bBlock;                     /* NOTE(review): no use visible in this file */
+};
+#else
+/* Without pthreads there are no worker threads. Only the fields read by
+** code compiled in both configurations are kept. */
+struct LsmWorker { int worker_rc; int bBlock; };
+#endif
+
+static void mt_shutdown(LsmDb *);
+
+/*
+** Return a pointer to a process-wide lsm_env object. The first call
+** populates it with a copy of the structure returned by
+** lsm_default_env(); later calls return the same static object.
+*/
+lsm_env *tdb_lsm_env(void){
+  static lsm_env sEnv;
+  static int isReady = 0;
+
+  if( !isReady ){
+    memcpy(&sEnv, lsm_default_env(), sizeof(sEnv));
+    isReady = 1;
+  }
+  return &sEnv;
+}
+
+typedef struct FileSector FileSector;
+typedef struct FileData FileData;
+
+/*
+** One entry per disk sector of a file. aOld is either NULL or a buffer
+** holding the sector contents as read from disk before the first write
+** to that sector since the last sync (see testEnvWrite()).
+*/
+struct FileSector {
+ u8 *aOld; /* Old data for this sector */
+};
+
+/*
+** Per-file crash-simulation state: a growable array of FileSector
+** entries indexed by sector number.
+*/
+struct FileData {
+ int nSector; /* Allocated size of aSector[] array */
+ FileSector *aSector; /* Array of file sectors */
+};
+
+/*
+** bPrepareCrash:
+** If non-zero, the file wrappers maintain enough in-memory data to
+** simulate the effect of a power-failure on the file-system (i.e. that
+** unsynced sectors may be written, not written, or overwritten with
+** arbitrary data when the crash occurs).
+**
+** bCrashed:
+** Set to true after a crash is simulated. Once this variable is true, all
+** VFS methods other than xClose() return LSM_IOERR as soon as they are
+** called (without affecting the contents of the file-system).
+**
+** env:
+** The environment object used by all lsm_db* handles opened by this
+** object (i.e. LsmDb.db plus any worker connections). Variable env.pVfsCtx
+** always points to the containing LsmDb structure.
+*/
+struct LsmDb {
+ TestDb base; /* Base class - methods table */
+ lsm_env env; /* Environment used by connection db */
+ char *zName; /* Database file name */
+ lsm_db *db; /* LSM database handle */
+
+ lsm_cursor *pCsr; /* Cursor held open during read transaction */
+ void *pBuf; /* Buffer for tdb_fetch() output */
+ int nBuf; /* Allocated (not used) size of pBuf */
+
+ /* Crash testing related state */
+ int bCrashed; /* True once a crash has occurred */
+ int nAutoCrash; /* Number of syncs until a crash */
+ int bPrepareCrash; /* True to store writes in memory */
+
+ /* Unsynced data (while crash testing) */
+ int szSector; /* Assumed size of disk sectors (testLsmOpen() sets 256) */
+ FileData aFile[2]; /* Database and log file data */
+
+ /* Other test instrumentation */
+ int bNoRecovery; /* If true, assume DMS2 is locked */
+
+ /* Work hook redirection */
+ void (*xWork)(lsm_db *, void *); /* User work-hook, or NULL */
+ void *pWorkCtx; /* Second argument passed to xWork */
+
+ /* IO logging hook */
+ void (*xWriteHook)(void *, int, lsm_i64, int, int); /* Called after each timed write/sync */
+ void *pWriteCtx; /* First argument passed to xWriteHook */
+
+ /* Worker threads (for lsm_mt) */
+ int nMtMinCkpt; /* Checkpointer thread checkpoints at or above this size */
+ int nMtMaxCkpt; /* Threshold used by waitOnCheckpointer() */
+ int eMode; /* An LSMTEST_MODE_XXX value */
+ int nWorker; /* Number of entries of aWorker[] in use */
+ LsmWorker *aWorker; /* Worker objects (allocated by testLsmStartWorkers()) */
+};
+
+#define LSMTEST_MODE_SINGLETHREAD 1
+#define LSMTEST_MODE_BACKGROUND_CKPT 2
+#define LSMTEST_MODE_BACKGROUND_WORK 3
+#define LSMTEST_MODE_BACKGROUND_BOTH 4
+
+/*************************************************************************
+**************************************************************************
+** Begin test VFS code.
+*/
+
+/*
+** File handle returned by testEnvOpen(). It wraps the handle opened by
+** the real environment and records which LsmDb opened it, so that the
+** wrapper methods can consult the crash-testing state.
+*/
+struct LsmFile {
+ lsm_file *pReal; /* Real underlying file */
+ int bLog; /* True for log file. False for db file */
+ LsmDb *pDb; /* Database handle that uses this file */
+};
+
+/*
+** xFullpath method of the test VFS. Forwards directly to the xFullpath
+** method of the environment returned by tdb_lsm_env(); pEnv itself is
+** not used.
+*/
+static int testEnvFullpath(
+  lsm_env *pEnv,                  /* Environment for current LsmDb */
+  const char *zFile,              /* Relative path name */
+  char *zOut,                     /* Output buffer */
+  int *pnOut                      /* IN/OUT: Size of output buffer */
+){
+  lsm_env *pBase = tdb_lsm_env();
+  int res = pBase->xFullpath(pBase, zFile, zOut, pnOut);
+  return res;
+}
+
+/*
+** xOpen method of the test VFS. Opens zFile through the real environment
+** and wraps the resulting handle in a new LsmFile that records the
+** owning LsmDb (taken from pEnv->pVfsCtx) and whether the name ends in
+** "-log".
+**
+** On failure the wrapper is freed and *ppFile is set to NULL. The return
+** value is whatever the real xOpen returned.
+*/
+static int testEnvOpen(
+  lsm_env *pEnv,                  /* Environment for current LsmDb */
+  const char *zFile,              /* Name of file to open */
+  int flags,
+  lsm_file **ppFile               /* OUT: New file handle object */
+){
+  lsm_env *pBase = tdb_lsm_env();
+  int nName = strlen(zFile);      /* Length of zFile in bytes */
+  LsmFile *pNew;                  /* Wrapper object to return */
+  int res;                        /* Result of the real xOpen */
+
+  pNew = (LsmFile *)testMalloc(sizeof(LsmFile));
+  pNew->pDb = (LsmDb *)pEnv->pVfsCtx;
+  pNew->bLog = 0;
+  if( nName>4 && memcmp("-log", &zFile[nName-4], 4)==0 ){
+    pNew->bLog = 1;
+  }
+
+  res = pBase->xOpen(pBase, zFile, flags, &pNew->pReal);
+  if( res!=LSM_OK ){
+    testFree(pNew);
+    pNew = 0;
+  }
+
+  *ppFile = (lsm_file *)pNew;
+  return res;
+}
+
+/*
+** xRead method of the test VFS. Returns LSM_IOERR without touching the
+** file once a crash has been simulated; otherwise forwards to the real
+** xRead.
+*/
+static int testEnvRead(lsm_file *pFile, lsm_i64 iOff, void *pData, int nData){
+  lsm_env *pBase = tdb_lsm_env();
+  LsmFile *pWrap = (LsmFile *)pFile;
+
+  if( pWrap->pDb->bCrashed ){
+    return LSM_IOERR;
+  }
+  return pBase->xRead(pWrap->pReal, iOff, pData, nData);
+}
+
+/*
+** xWrite method of the test VFS.
+**
+**   * After a simulated crash, returns LSM_IOERR immediately.
+**
+**   * If LsmDb.bPrepareCrash is set, then for each sector touched by
+**     this write that has no saved image yet, the current on-disk
+**     contents of the sector are read and stored in FileSector.aOld. The
+**     sector array is grown in multiples of 64 entries as required.
+**
+**   * The write is then passed to the real xWrite. If a write-hook is
+**     registered the call is timed with gettimeofday() and the hook is
+**     invoked with the file type, offset, size and elapsed microseconds.
+*/
+static int testEnvWrite(lsm_file *pFile, lsm_i64 iOff, void *pData, int nData){
+  lsm_env *pBase = tdb_lsm_env();
+  LsmFile *pWrap = (LsmFile *)pFile;
+  LsmDb *pDb = pWrap->pDb;
+
+  if( pDb->bCrashed ) return LSM_IOERR;
+
+  if( pDb->bPrepareCrash ){
+    FileData *pFd = &pDb->aFile[pWrap->bLog];
+    int iFirst = (int)(iOff / pDb->szSector);
+    int iLast = (int)((iOff + nData - 1) / pDb->szSector);
+    int iSector;
+
+    /* Extend the sector array so that it covers sector iLast */
+    if( (iLast+1)>pFd->nSector ){
+      int nNew = ( ((iLast + 1) + 63) / 64 ) * 64;
+      int nAdd;
+      assert( nNew>iLast );
+      pFd->aSector = (FileSector *)testRealloc(
+          pFd->aSector, nNew*sizeof(FileSector)
+      );
+      nAdd = nNew - pFd->nSector;
+      memset(&pFd->aSector[pFd->nSector], 0, nAdd * sizeof(FileSector));
+      pFd->nSector = nNew;
+    }
+
+    /* Save the pre-write image of each sector not already saved */
+    for(iSector=iFirst; iSector<=iLast; iSector++){
+      FileSector *pSector = &pFd->aSector[iSector];
+      if( pSector->aOld==0 ){
+        u8 *aOld = (u8 *)testMalloc(pDb->szSector);
+        pBase->xRead(
+            pWrap->pReal, (lsm_i64)iSector*pDb->szSector, aOld, pDb->szSector
+        );
+        pSector->aOld = aOld;
+      }
+    }
+  }
+
+  if( pDb->xWriteHook==0 ){
+    return pBase->xWrite(pWrap->pReal, iOff, pData, nData);
+  }else{
+    struct timeval tStart;
+    struct timeval tEnd;
+    int nUs;
+    int res;
+
+    gettimeofday(&tStart, 0);
+    assert( nData>0 );
+    res = pBase->xWrite(pWrap->pReal, iOff, pData, nData);
+    gettimeofday(&tEnd, 0);
+
+    nUs = (tEnd.tv_sec - tStart.tv_sec) * 1000000
+        + (tEnd.tv_usec - tStart.tv_usec);
+    pDb->xWriteHook(pDb->pWriteCtx, pWrap->bLog, iOff, nData, nUs);
+    return res;
+  }
+}
+
+static void doSystemCrash(LsmDb *pDb);
+
+/*
+** xSync method of the test VFS.
+**
+**   * After a simulated crash, returns LSM_IOERR immediately.
+**
+**   * If LsmDb.nAutoCrash is non-zero it is decremented. When it reaches
+**     zero a system crash is simulated via doSystemCrash(), bCrashed is
+**     set and LSM_IOERR is returned without syncing.
+**
+**   * If LsmDb.bPrepareCrash is set, all saved sector images for this
+**     file are discarded, since the data is about to be synced.
+**
+**   * The sync is passed to the real xSync. If a write-hook is registered
+**     the call is timed and the hook invoked with offset and size zero.
+*/
+static int testEnvSync(lsm_file *pFile){
+  lsm_env *pRealEnv = tdb_lsm_env();
+  LsmFile *p = (LsmFile *)pFile;
+  LsmDb *pDb = p->pDb;
+  FileData *pData = &pDb->aFile[p->bLog];
+  int i;
+
+  if( pDb->bCrashed ) return LSM_IOERR;
+
+  if( pDb->nAutoCrash ){
+    pDb->nAutoCrash--;
+    if( pDb->nAutoCrash==0 ){
+      doSystemCrash(pDb);
+      pDb->bCrashed = 1;
+      return LSM_IOERR;
+    }
+  }
+
+  if( pDb->bPrepareCrash ){
+    for(i=0; i<pData->nSector; i++){
+      testFree(pData->aSector[i].aOld);
+      pData->aSector[i].aOld = 0;
+    }
+  }
+
+  if( pDb->xWriteHook ){
+    int rc;
+    int nUs;
+    struct timeval t1;
+    struct timeval t2;
+
+    gettimeofday(&t1, 0);
+    rc = pRealEnv->xSync(p->pReal);
+    gettimeofday(&t2, 0);
+
+    nUs = (t2.tv_sec - t1.tv_sec) * 1000000 + (t2.tv_usec - t1.tv_usec);
+    pDb->xWriteHook(pDb->pWriteCtx, p->bLog, 0, 0, nUs);
+    return rc;
+  }
+
+  return pRealEnv->xSync(p->pReal);
+}
+
+/*
+** xTruncate method of the test VFS. Fails with LSM_IOERR after a
+** simulated crash; otherwise forwards to the real xTruncate.
+*/
+static int testEnvTruncate(lsm_file *pFile, lsm_i64 iOff){
+  lsm_env *pBase = tdb_lsm_env();
+  LsmFile *pWrap = (LsmFile *)pFile;
+
+  if( pWrap->pDb->bCrashed ){
+    return LSM_IOERR;
+  }
+  return pBase->xTruncate(pWrap->pReal, iOff);
+}
+
+/*
+** xSectorSize method of the test VFS. Returns whatever the real
+** environment reports for the wrapped file.
+*/
+static int testEnvSectorSize(lsm_file *pFile){
+  lsm_env *pBase = tdb_lsm_env();
+  lsm_file *pReal = ((LsmFile *)pFile)->pReal;
+  return pBase->xSectorSize(pReal);
+}
+
+/*
+** xRemap method of the test VFS. Forwards all arguments to the real
+** xRemap, substituting the wrapped file handle.
+*/
+static int testEnvRemap(
+  lsm_file *pFile,
+  lsm_i64 iMin,
+  void **ppOut,
+  lsm_i64 *pnOut
+){
+  lsm_env *pBase = tdb_lsm_env();
+  lsm_file *pReal = ((LsmFile *)pFile)->pReal;
+  return pBase->xRemap(pReal, iMin, ppOut, pnOut);
+}
+
+/*
+** xFileid method of the test VFS. Forwards to the real xFileid using
+** the wrapped file handle.
+*/
+static int testEnvFileid(
+  lsm_file *pFile,
+  void *ppOut,
+  int *pnOut
+){
+  lsm_env *pBase = tdb_lsm_env();
+  lsm_file *pReal = ((LsmFile *)pFile)->pReal;
+  return pBase->xFileid(pReal, ppOut, pnOut);
+}
+
+/*
+** xClose method of the test VFS. Closes the wrapped file, frees the
+** LsmFile wrapper and always returns LSM_OK (the result of the real
+** xClose is not reported). Unlike the other methods this one does not
+** check the bCrashed flag.
+*/
+static int testEnvClose(lsm_file *pFile){
+  LsmFile *pWrap = (LsmFile *)pFile;
+  lsm_env *pBase = tdb_lsm_env();
+
+  pBase->xClose(pWrap->pReal);
+  testFree(pWrap);
+
+  return LSM_OK;
+}
+
+/*
+** xUnlink method of the test VFS. pEnv is ignored; the request is
+** forwarded to the environment returned by tdb_lsm_env().
+*/
+static int testEnvUnlink(lsm_env *pEnv, const char *zFile){
+  lsm_env *pBase = tdb_lsm_env();
+  int res;
+  unused_parameter(pEnv);
+  res = pBase->xUnlink(pBase, zFile);
+  return res;
+}
+
+/*
+** xLock method of the test VFS. If the owning LsmDb has bNoRecovery set,
+** an attempt to take an exclusive lock on lock number 2 fails with
+** LSM_BUSY. All other requests are forwarded to the real xLock.
+*/
+static int testEnvLock(lsm_file *pFile, int iLock, int eType){
+  LsmFile *pWrap = (LsmFile *)pFile;
+  lsm_env *pBase = tdb_lsm_env();
+  int bBlocked = (iLock==2 && eType==LSM_LOCK_EXCL && pWrap->pDb->bNoRecovery);
+
+  if( bBlocked ) return LSM_BUSY;
+  return pBase->xLock(pWrap->pReal, iLock, eType);
+}
+
+/*
+** xTestLock method of the test VFS. Mirrors testEnvLock(): with
+** bNoRecovery set, a test for an exclusive lock starting at lock number
+** 2 reports LSM_BUSY. Everything else is forwarded to the real
+** xTestLock.
+*/
+static int testEnvTestLock(lsm_file *pFile, int iLock, int nLock, int eType){
+  LsmFile *pWrap = (LsmFile *)pFile;
+  lsm_env *pBase = tdb_lsm_env();
+  int bBlocked = (iLock==2 && eType==LSM_LOCK_EXCL && pWrap->pDb->bNoRecovery);
+
+  if( bBlocked ) return LSM_BUSY;
+  return pBase->xTestLock(pWrap->pReal, iLock, nLock, eType);
+}
+
+/*
+** xShmMap method of the test VFS. Forwards to the real xShmMap using
+** the wrapped file handle.
+*/
+static int testEnvShmMap(lsm_file *pFile, int iRegion, int sz, void **pp){
+  lsm_env *pBase = tdb_lsm_env();
+  lsm_file *pReal = ((LsmFile *)pFile)->pReal;
+  return pBase->xShmMap(pReal, iRegion, sz, pp);
+}
+
+/*
+** xShmBarrier method of the test VFS. This implementation is empty - it
+** does not forward to the real environment.
+*/
+static void testEnvShmBarrier(void){
+}
+
+/*
+** xShmUnmap method of the test VFS. Forwards to the real xShmUnmap
+** using the wrapped file handle.
+*/
+static int testEnvShmUnmap(lsm_file *pFile, int bDel){
+  lsm_env *pBase = tdb_lsm_env();
+  lsm_file *pReal = ((LsmFile *)pFile)->pReal;
+  return pBase->xShmUnmap(pReal, bDel);
+}
+
+/*
+** xSleep method of the test VFS. pEnv is not used; the call is forwarded
+** to the environment returned by tdb_lsm_env().
+*/
+static int testEnvSleep(lsm_env *pEnv, int us){
+  lsm_env *pBase = tdb_lsm_env();
+  int res = pBase->xSleep(pBase, us);
+  return res;
+}
+
+/*
+** Simulate the effect of a power failure on the database file and its
+** log file ("<zName>-log").
+**
+** For every sector that has a saved pre-write image (FileSector.aOld),
+** one of three outcomes is selected using testPrngValue():
+**
+**   0: Leave the sector as it currently is on disk.
+**   1: Overwrite the saved image with pseudo-random data, then write it
+**      to the sector.
+**   2: Write the saved (old) image back to the sector.
+**
+** The saved image is freed in all cases. Files are accessed through the
+** real environment, not the test VFS wrappers.
+*/
+static void doSystemCrash(LsmDb *pDb){
+  lsm_env *pEnv = tdb_lsm_env();
+  int iFile;
+  int iSeed = pDb->aFile[0].nSector + pDb->aFile[1].nSector;
+
+  /* Build the log file name once. Previously it was rebuilt at the end
+  ** of each loop iteration and only the last copy was freed. */
+  char *zLog = sqlite3_mprintf("%s-log", pDb->zName);
+
+  for(iFile=0; iFile<2; iFile++){
+    const char *zFile = (iFile==0 ? pDb->zName : zLog);
+    lsm_file *pFile = 0;
+    int i;
+
+    pEnv->xOpen(pEnv, zFile, 0, &pFile);
+    for(i=0; i<pDb->aFile[iFile].nSector; i++){
+      u8 *aOld = pDb->aFile[iFile].aSector[i].aOld;
+      if( aOld ){
+        int iOpt = testPrngValue(iSeed++) % 3;
+        switch( iOpt ){
+          case 0:
+            break;
+
+          case 1:
+            testPrngArray(iSeed++, (u32 *)aOld, pDb->szSector/4);
+            /* Fall-through */
+
+          case 2:
+            /* Skip the write if the file could not be opened */
+            if( pFile ){
+              pEnv->xWrite(
+                  pFile, (lsm_i64)i * pDb->szSector, aOld, pDb->szSector
+              );
+            }
+            break;
+        }
+        testFree(aOld);
+        pDb->aFile[iFile].aSector[i].aOld = 0;
+      }
+    }
+    if( pFile ) pEnv->xClose(pFile);
+  }
+
+  sqlite3_free(zLog);
+}
+/*
+** End test VFS code.
+**************************************************************************
+*************************************************************************/
+
+/*************************************************************************
+**************************************************************************
+** Begin test compression hooks.
+*/
+
+#ifdef HAVE_ZLIB
+#include <zlib.h>
+
+/*
+** xBound method: return the zlib upper bound on the compressed size of
+** nSrc bytes of input.
+*/
+static int testZipBound(void *pCtx, int nSrc){
+  return compressBound(nSrc);
+}
+
+/*
+** xCompress method: compress nIn bytes from aIn into aOut using zlib
+** compress(). On entry *pnOut is the size of the output buffer; on exit
+** it is set to the value compress() left in its length argument.
+** Returns 0 if compress() returned Z_OK, otherwise LSM_ERROR.
+*/
+static int testZipCompress(
+  void *pCtx,                     /* Context pointer */
+  char *aOut, int *pnOut,         /* OUT: Buffer containing compressed data */
+  const char *aIn, int nIn        /* Buffer containing input data */
+){
+  uLongf n = *pnOut;              /* In/out buffer size for compress() */
+  int rc;                         /* compress() return code */
+
+  rc = compress((Bytef*)aOut, &n, (Bytef*)aIn, nIn);
+  *pnOut = n;
+  return (rc==Z_OK ? 0 : LSM_ERROR);
+}
+
+/*
+** xUncompress method: the inverse of testZipCompress(), implemented with
+** zlib uncompress(). Same conventions for *pnOut and the return value.
+*/
+static int testZipUncompress(
+  void *pCtx,                     /* Context pointer */
+  char *aOut, int *pnOut,         /* OUT: Buffer containing uncompressed data */
+  const char *aIn, int nIn        /* Buffer containing input data */
+){
+  uLongf n = *pnOut;              /* In/out buffer size for uncompress() */
+  int rc;                         /* uncompress() return code */
+
+  rc = uncompress((Bytef*)aOut, &n, (Bytef*)aIn, nIn);
+  *pnOut = n;
+  return (rc==Z_OK ? 0 : LSM_ERROR);
+}
+
+/*
+** Register the three zlib hooks above with connection pDb via
+** LSM_CONFIG_SET_COMPRESSION. Returns the lsm_config() result.
+*/
+static int testConfigureCompression(lsm_db *pDb){
+  static lsm_compress zip = {
+    0,                            /* Context pointer (unused) */
+    1,                            /* Id value */
+    testZipBound,                 /* xBound method */
+    testZipCompress,              /* xCompress method */
+    testZipUncompress             /* xUncompress method */
+  };
+  return lsm_config(pDb, LSM_CONFIG_SET_COMPRESSION, &zip);
+}
+#endif /* ifdef HAVE_ZLIB */
+
+/*
+** End test compression hooks.
+**************************************************************************
+*************************************************************************/
+
+/*
+** xClose method for LSM TestDb handles. Closes the read cursor and the
+** database connection, stops any worker threads, releases all
+** crash-simulation buffers and finally frees the handle itself.
+**
+** Returns LSM_OK, or the first non-OK worker_rc value recorded by a
+** worker thread.
+*/
+static int test_lsm_close(TestDb *pTestDb){
+  int i;
+  int iFile;
+  int rc = LSM_OK;
+  LsmDb *pDb = (LsmDb *)pTestDb;
+
+  lsm_csr_close(pDb->pCsr);
+  lsm_close(pDb->db);
+
+  /* If this is a multi-threaded database, wait on the worker threads. */
+  mt_shutdown(pDb);
+  for(i=0; i<pDb->nWorker && rc==LSM_OK; i++){
+    rc = pDb->aWorker[i].worker_rc;
+  }
+  /* The worker array is allocated by testLsmStartWorkers() and is not
+  ** released anywhere else. It is NULL if no workers were started. */
+  testFree(pDb->aWorker);
+
+  /* Free the saved sector images for the database and log files */
+  for(iFile=0; iFile<2; iFile++){
+    for(i=0; i<pDb->aFile[iFile].nSector; i++){
+      testFree(pDb->aFile[iFile].aSector[i].aOld);
+    }
+    testFree(pDb->aFile[iFile].aSector);
+  }
+
+  /* Release pBuf before poisoning the structure. The memset() arguments
+  ** were previously transposed (fill value and length swapped), so only
+  ** 0x11 bytes at the start of the structure were overwritten. */
+  testFree((char *)pDb->pBuf);
+  memset(pDb, 0x11, sizeof(LsmDb));
+  testFree((char *)pDb);
+  return rc;
+}
+
+static void mt_signal_worker(LsmDb*, int);
+
+/*
+** Block until the value reported by LSM_INFO_CHECKPOINT_SIZE for
+** connection db drops below pDb->nMtMaxCkpt, or until lsm_info() fails.
+** While waiting, the checkpointer thread is signalled (worker 0 in
+** BACKGROUND_CKPT mode, worker 1 otherwise) and the caller sleeps for
+** 5ms between polls.
+**
+** Returns the result of the last lsm_info() call.
+*/
+static int waitOnCheckpointer(LsmDb *pDb, lsm_db *db){
+  int nSleep = 0;
+  int nKB;
+  int rc;
+
+  do {
+    nKB = 0;
+    rc = lsm_info(db, LSM_INFO_CHECKPOINT_SIZE, &nKB);
+    if( rc!=LSM_OK || nKB<pDb->nMtMaxCkpt ) break;
+#ifdef LSM_MUTEX_PTHREADS
+    mt_signal_worker(pDb,
+        (pDb->eMode==LSMTEST_MODE_BACKGROUND_CKPT ? 0 : 1)
+    );
+#endif
+    usleep(5000);
+    nSleep += 5;
+  }while( 1 );
+
+#if 0
+  if( nSleep ) printf("# waitOnCheckpointer(): nSleep=%d\n", nSleep);
+#endif
+
+  return rc;
+}
+
+/*
+** Throttle the client while the worker thread catches up. The current
+** LSM_CONFIG_AUTOFLUSH setting is queried (by passing -1), then
+** LSM_INFO_TREE_SIZE is polled every 5ms - signalling worker 0 each
+** time - until either the old tree size is zero or the new tree size is
+** less than half the autoflush limit.
+**
+** Returns the lsm_info() error code if a poll fails, otherwise the
+** result of the initial lsm_config() call.
+*/
+static int waitOnWorker(LsmDb *pDb){
+  int nSleep = 0;
+  int nLimit = -1;
+  int rc = lsm_config(pDb->db, LSM_CONFIG_AUTOFLUSH, &nLimit);
+
+  for(;;){
+    int nOld, nNew;
+    int rcInfo = lsm_info(pDb->db, LSM_INFO_TREE_SIZE, &nOld, &nNew);
+    if( rcInfo!=LSM_OK ) return rcInfo;
+    if( nOld==0 || nNew<(nLimit/2) ) break;
+#ifdef LSM_MUTEX_PTHREADS
+    mt_signal_worker(pDb, 0);
+#endif
+    usleep(5000);
+    nSleep += 5;
+  }
+
+#if 0
+  if( nSleep ) printf("# waitOnWorker(): nSleep=%d\n", nSleep);
+#endif
+
+  return rc;
+}
+
+/*
+** xWrite method for LSM TestDb handles. In the multi-threaded modes the
+** call first waits for the background thread to catch up
+** (waitOnCheckpointer() in BACKGROUND_CKPT mode, waitOnWorker() in the
+** BACKGROUND_WORK and BACKGROUND_BOTH modes). If that succeeds the
+** key/value pair is written with lsm_insert().
+*/
+static int test_lsm_write(
+  TestDb *pTestDb,
+  void *pKey,
+  int nKey,
+  void *pVal,
+  int nVal
+){
+  LsmDb *pDb = (LsmDb *)pTestDb;
+  int res = LSM_OK;
+
+  switch( pDb->eMode ){
+    case LSMTEST_MODE_BACKGROUND_CKPT:
+      res = waitOnCheckpointer(pDb, pDb->db);
+      break;
+    case LSMTEST_MODE_BACKGROUND_WORK:
+    case LSMTEST_MODE_BACKGROUND_BOTH:
+      res = waitOnWorker(pDb);
+      break;
+    default:
+      break;
+  }
+  if( res!=LSM_OK ) return res;
+
+  return lsm_insert(pDb->db, pKey, nKey, pVal, nVal);
+}
+
+/*
+** xDelete method for LSM TestDb handles: a direct call to lsm_delete()
+** on the wrapped connection.
+*/
+static int test_lsm_delete(TestDb *pTestDb, void *pKey, int nKey){
+  lsm_db *pConn = ((LsmDb *)pTestDb)->db;
+  return lsm_delete(pConn, pKey, nKey);
+}
+
+/*
+** xDeleteRange method for LSM TestDb handles: a direct call to
+** lsm_delete_range() on the wrapped connection.
+*/
+static int test_lsm_delete_range(
+  TestDb *pTestDb,
+  void *pKey1, int nKey1,
+  void *pKey2, int nKey2
+){
+  lsm_db *pConn = ((LsmDb *)pTestDb)->db;
+  return lsm_delete_range(pConn, pKey1, nKey1, pKey2, nKey2);
+}
+
+/*
+** xFetch method for LSM TestDb handles. Looks up key (pKey/nKey) with a
+** temporary cursor.
+**
+** If the key is found, the value is copied into LsmDb.pBuf (which is
+** grown to twice the value size when too small) and *ppVal / *pnVal are
+** set to describe it. If the key is not found, *ppVal is set to 0 and
+** *pnVal to -1. If pKey is NULL the function returns LSM_OK without
+** touching the outputs. If an LSM call fails, its error code is
+** returned and the outputs are left unchanged.
+*/
+static int test_lsm_fetch(
+  TestDb *pTestDb,
+  void *pKey,
+  int nKey,
+  void **ppVal,
+  int *pnVal
+){
+  int rc;
+  LsmDb *pDb = (LsmDb *)pTestDb;
+  lsm_cursor *csr;
+
+  if( pKey==0 ) return LSM_OK;
+
+  rc = lsm_csr_open(pDb->db, &csr);
+  if( rc!=LSM_OK ) return rc;
+
+  rc = lsm_csr_seek(csr, pKey, nKey, LSM_SEEK_EQ);
+  if( rc==LSM_OK ){
+    if( lsm_csr_valid(csr) ){
+      const void *pVal = 0;
+      int nVal = 0;
+      rc = lsm_csr_value(csr, &pVal, &nVal);
+      /* Only use pVal/nVal if lsm_csr_value() succeeded. Previously they
+      ** were passed to memcpy() regardless of the return code. */
+      if( rc==LSM_OK ){
+        if( nVal>pDb->nBuf ){
+          testFree(pDb->pBuf);
+          pDb->pBuf = testMalloc(nVal*2);
+          pDb->nBuf = nVal*2;
+        }
+        /* pBuf may still be NULL when nVal is zero */
+        if( nVal>0 ) memcpy(pDb->pBuf, pVal, nVal);
+        *ppVal = pDb->pBuf;
+        *pnVal = nVal;
+      }
+    }else{
+      *ppVal = 0;
+      *pnVal = -1;
+    }
+  }
+  lsm_csr_close(csr);
+  return rc;
+}
+
+/*
+** xScan method for LSM TestDb handles. Invokes xCallback for each entry
+** with a key between pFirst/nFirst and pLast/nLast inclusive, in
+** ascending key order, or descending order if bReverse is true. A NULL
+** pFirst or pLast leaves that end of the range open.
+**
+** NOTE(review): the tail of this function and the head of
+** test_lsm_begin() were missing from the patch text (everything between
+** "nLast" and "pCsr==0" on one line had been dropped). They have been
+** rebuilt to match the surrounding code - verify against the upstream
+** file before applying.
+*/
+static int test_lsm_scan(
+  TestDb *pTestDb,
+  void *pCtx,
+  int bReverse,
+  void *pFirst, int nFirst,
+  void *pLast, int nLast,
+  void (*xCallback)(void *, void *, int , void *, int)
+){
+  LsmDb *pDb = (LsmDb *)pTestDb;
+  lsm_cursor *csr;
+  int rc;
+
+  rc = lsm_csr_open(pDb->db, &csr);
+  if( rc!=LSM_OK ) return rc;
+
+  /* Position the cursor at the end of the range the scan starts from */
+  if( bReverse ){
+    if( pLast ){
+      rc = lsm_csr_seek(csr, pLast, nLast, LSM_SEEK_LE);
+    }else{
+      rc = lsm_csr_last(csr);
+    }
+  }else{
+    if( pFirst ){
+      rc = lsm_csr_seek(csr, pFirst, nFirst, LSM_SEEK_GE);
+    }else{
+      rc = lsm_csr_first(csr);
+    }
+  }
+
+  while( rc==LSM_OK && lsm_csr_valid(csr) ){
+    const void *pKey; int nKey;
+    const void *pVal; int nVal;
+    int cmp;
+
+    lsm_csr_key(csr, &pKey, &nKey);
+    lsm_csr_value(csr, &pVal, &nVal);
+
+    /* Stop once the cursor has moved past the far end of the range */
+    if( bReverse && pFirst ){
+      cmp = memcmp(pFirst, pKey, MIN(nKey, nFirst));
+      if( cmp>0 || (cmp==0 && nFirst>nKey) ) break;
+    }else if( bReverse==0 && pLast ){
+      cmp = memcmp(pLast, pKey, MIN(nKey, nLast));
+      if( cmp<0 || (cmp==0 && nLast<nKey) ) break;
+    }
+
+    xCallback(pCtx, (void *)pKey, nKey, (void *)pVal, nVal);
+
+    if( bReverse ){
+      rc = lsm_csr_prev(csr);
+    }else{
+      rc = lsm_csr_next(csr);
+    }
+  }
+
+  lsm_csr_close(csr);
+  return rc;
+}
+
+/*
+** xBegin method for LSM TestDb handles. iLevel==0 is a no-op. For
+** iLevel>=1 a cursor is opened (if one is not already open) to hold a
+** read transaction, and for iLevel>1 lsm_begin() is called with
+** (iLevel-1).
+*/
+static int test_lsm_begin(TestDb *pTestDb, int iLevel){
+  int rc = LSM_OK;
+  LsmDb *pDb = (LsmDb *)pTestDb;
+
+  /* iLevel==0 is a no-op. */
+  if( iLevel==0 ) return 0;
+
+  if( pDb->pCsr==0 ) rc = lsm_csr_open(pDb->db, &pDb->pCsr);
+  if( rc==LSM_OK && iLevel>1 ){
+    rc = lsm_begin(pDb->db, iLevel-1);
+  }
+
+  return rc;
+}
+/*
+** xCommit method for LSM TestDb handles. When iLevel is 0 the cursor
+** holding the read transaction (if any) is closed first. lsm_commit()
+** is then called with level MAX(0, iLevel-1).
+*/
+static int test_lsm_commit(TestDb *pTestDb, int iLevel){
+  LsmDb *pDb = (LsmDb *)pTestDb;
+  int bCloseRead = (iLevel==0 && pDb->pCsr);
+
+  if( bCloseRead ){
+    lsm_csr_close(pDb->pCsr);
+    pDb->pCsr = 0;
+  }
+
+  return lsm_commit(pDb->db, MAX(0, iLevel-1));
+}
+/*
+** xRollback method for LSM TestDb handles. When iLevel is 0 the cursor
+** holding the read transaction (if any) is closed first. lsm_rollback()
+** is then called with level MAX(0, iLevel-1).
+*/
+static int test_lsm_rollback(TestDb *pTestDb, int iLevel){
+  LsmDb *pDb = (LsmDb *)pTestDb;
+  int bCloseRead = (iLevel==0 && pDb->pCsr);
+
+  if( bCloseRead ){
+    lsm_csr_close(pDb->pCsr);
+    pDb->pCsr = 0;
+  }
+
+  return lsm_rollback(pDb->db, MAX(0, iLevel-1));
+}
+
+/*
+** A log message callback registered with lsm connections. Prints all
+** messages to stderr.
+*/
+static void xLog(void *pCtx, int rc, const char *z){
+  const char *zPrefix = (const char *)pCtx;   /* Optional label, or NULL */
+  unused_parameter(rc);
+
+  /* Output is "<label>: <message>" when a label was registered, or just
+  ** the message otherwise. The error code is not printed. */
+  if( zPrefix ){
+    fprintf(stderr, "%s: ", zPrefix);
+  }
+  fprintf(stderr, "%s\n", z);
+  fflush(stderr);
+}
+
+/*
+** Work-hook registered with the client connection by testLsmOpen().
+** pArg is the LsmDb; the call is relayed to the user hook installed via
+** tdb_lsm_config_work_hook(), if there is one.
+*/
+static void xWorkHook(lsm_db *db, void *pArg){
+  LsmDb *pDb = (LsmDb *)pArg;
+  if( pDb->xWork==0 ) return;
+  pDb->xWork(db, pDb->pWorkCtx);
+}
+
+#define TEST_NO_RECOVERY -1
+#define TEST_COMPRESSION -3
+
+#define TEST_MT_MODE -2
+#define TEST_MT_MIN_CKPT -4
+#define TEST_MT_MAX_CKPT -5
+
+/*
+** Parse configuration string zStr - a list of "name=value" settings,
+** optionally preceded by spaces - and apply each setting.
+**
+** pLsm: Test handle that receives the test-only settings
+** (test_no_recovery, mt_min_ckpt, mt_max_ckpt). May be NULL, in
+** which case those settings are parsed but ignored.
+** db: Connection to which LSM_CONFIG_XXX settings are applied.
+** bWorker: True if db is a worker connection. Table entries flagged
+** bWorker (worker_automerge) are only applied when this is set.
+** zStr: The configuration string. NULL is a no-op.
+** pnThread: If not NULL, set to the mt_mode value (default 1).
+**
+** Values are decimal integers with an optional suffix. Returns 0 on
+** success, the testArgSelect() result if a name is not recognised, or
+** 1 after printing a message on a syntax error.
+*/
+int test_lsm_config_str(
+ LsmDb *pLsm,
+ lsm_db *db,
+ int bWorker,
+ const char *zStr,
+ int *pnThread
+){
+ struct CfgParam {
+ const char *zParam;
+ int bWorker;
+ int eParam;
+ } aParam[] = {
+ { "autoflush", 0, LSM_CONFIG_AUTOFLUSH },
+ { "page_size", 0, LSM_CONFIG_PAGE_SIZE },
+ { "block_size", 0, LSM_CONFIG_BLOCK_SIZE },
+ { "safety", 0, LSM_CONFIG_SAFETY },
+ { "autowork", 0, LSM_CONFIG_AUTOWORK },
+ { "autocheckpoint", 0, LSM_CONFIG_AUTOCHECKPOINT },
+ { "mmap", 0, LSM_CONFIG_MMAP },
+ { "use_log", 0, LSM_CONFIG_USE_LOG },
+ { "automerge", 0, LSM_CONFIG_AUTOMERGE },
+ { "max_freelist", 0, LSM_CONFIG_MAX_FREELIST },
+ { "multi_proc", 0, LSM_CONFIG_MULTIPLE_PROCESSES },
+ { "worker_automerge", 1, LSM_CONFIG_AUTOMERGE },
+ { "test_no_recovery", 0, TEST_NO_RECOVERY },
+ /* NOTE(review): "bg_min_ckpt" maps to TEST_NO_RECOVERY, the same as
+ ** "test_no_recovery" - confirm this is intended. */
+ { "bg_min_ckpt", 0, TEST_NO_RECOVERY },
+
+ { "mt_mode", 0, TEST_MT_MODE },
+ { "mt_min_ckpt", 0, TEST_MT_MIN_CKPT },
+ { "mt_max_ckpt", 0, TEST_MT_MAX_CKPT },
+
+#ifdef HAVE_ZLIB
+ { "compression", 0, TEST_COMPRESSION },
+#endif
+ { 0, 0 }
+ };
+ const char *z = zStr;
+ int nThread = 1;
+
+ if( zStr==0 ) return 0;
+
+ assert( db );
+ while( z[0] ){
+ const char *zStart;
+
+ /* Skip whitespace */
+ while( *z==' ' ) z++;
+ zStart = z;
+
+ /* Scan forward to the '=' that ends the parameter name */
+ while( *z && *z!='=' ) z++;
+ if( *z ){
+ int eParam;
+ int i;
+ int iVal;
+ int iMul = 1;
+ int rc;
+ char zParam[32];
+ int nParam = z-zStart;
+ if( nParam==0 || nParam>sizeof(zParam)-1 ) goto syntax_error;
+
+ memcpy(zParam, zStart, nParam);
+ zParam[nParam] = '\0';
+ rc = testArgSelect(aParam, "param", zParam, &i);
+ if( rc!=0 ) return rc;
+ eParam = aParam[i].eParam;
+
+ /* Parse the value: a run of digits with an optional suffix. A 'k' or
+ ** 'K' suffix leaves the multiplier at 1; an 'M' suffix multiplies the
+ ** value by 1024.
+ **
+ ** NOTE(review): the second test below repeats 'M', so a lowercase 'm'
+ ** is not treated as a suffix. The config string in
+ ** test_lsm_lomem_open() has no separator between "autocheckpoint=32"
+ ** and "mmap=0" and so currently depends on that. */
+ z++;
+ zStart = z;
+ while( *z>='0' && *z<='9' ) z++;
+ if( *z=='k' || *z=='K' ){
+ iMul = 1;
+ z++;
+ }else if( *z=='M' || *z=='M' ){
+ iMul = 1024;
+ z++;
+ }
+ nParam = z-zStart;
+ if( nParam==0 || nParam>sizeof(zParam)-1 ) goto syntax_error;
+ memcpy(zParam, zStart, nParam);
+ zParam[nParam] = '\0';
+ iVal = atoi(zParam) * iMul;
+
+ /* Positive eParam values are LSM_CONFIG_XXX codes passed straight to
+ ** lsm_config(). Negative values are test-harness settings. */
+ if( eParam>0 ){
+ if( bWorker || aParam[i].bWorker==0 ){
+ lsm_config(db, eParam, &iVal);
+ }
+ }else{
+ switch( eParam ){
+ case TEST_NO_RECOVERY:
+ if( pLsm ) pLsm->bNoRecovery = iVal;
+ break;
+ case TEST_MT_MODE:
+ if( pLsm ) nThread = iVal;
+ break;
+ case TEST_MT_MIN_CKPT:
+ if( pLsm && iVal>0 ) pLsm->nMtMinCkpt = iVal*1024;
+ break;
+ case TEST_MT_MAX_CKPT:
+ if( pLsm && iVal>0 ) pLsm->nMtMaxCkpt = iVal*1024;
+ break;
+#ifdef HAVE_ZLIB
+ case TEST_COMPRESSION:
+ testConfigureCompression(db);
+ break;
+#endif
+ }
+ }
+ }else if( z!=zStart ){
+ /* Trailing text with no '=' */
+ goto syntax_error;
+ }
+ }
+
+ if( pnThread ) *pnThread = nThread;
+ /* Keep the minimum checkpoint threshold no larger than the maximum */
+ if( pLsm && pLsm->nMtMaxCkpt < pLsm->nMtMinCkpt ){
+ pLsm->nMtMinCkpt = pLsm->nMtMaxCkpt;
+ }
+
+ return 0;
+ syntax_error:
+ testPrintError("syntax error at: \"%s\"\n", z);
+ return 1;
+}
+
+/*
+** Apply configuration string zStr to TestDb handle pDb. This is a no-op
+** returning 0 unless pDb is an LSM handle. The string is applied to the
+** client connection first and then, in pthreads builds, to each worker
+** connection (with the bWorker flag set) until one of them fails.
+*/
+int tdb_lsm_config_str(TestDb *pDb, const char *zStr){
+  int rc = 0;
+  if( tdb_lsm(pDb) ){
+#ifdef LSM_MUTEX_PTHREADS
+    int i;
+#endif
+    LsmDb *pLsm = (LsmDb *)pDb;
+
+    rc = test_lsm_config_str(pLsm, pLsm->db, 0, zStr, 0);
+#ifdef LSM_MUTEX_PTHREADS
+    for(i=0; rc==0 && i<pLsm->nWorker; i++){
+      rc = test_lsm_config_str(0, pLsm->aWorker[i].pWorker, 1, zStr, 0);
+    }
+#endif
+  }
+  return rc;
+}
+
+/*
+** Apply configuration string zConfig to a bare lsm_db connection that
+** has no LsmDb wrapper. Test-only settings in the string are parsed but
+** have no effect.
+*/
+int tdb_lsm_configure(lsm_db *db, const char *zConfig){
+  LsmDb *pNoWrapper = 0;
+  return test_lsm_config_str(pNoWrapper, db, 0, zConfig, 0);
+}
+
+static int testLsmStartWorkers(LsmDb *, int, const char *, const char *);
+
+/*
+** Open an LSM database wrapped in a TestDb handle.
+**
+** zCfg: Configuration string for test_lsm_config_str(), or NULL.
+** zFilename: Database file name.
+** bClear: If true, delete any existing database first.
+** ppDb: OUT: the new handle, or NULL if an error occurs.
+**
+** The handle gets a private copy of the default environment with every
+** file-system method replaced by a testEnvXXX wrapper. If zCfg selects a
+** multi-threaded mode (mt_mode>1), worker threads are started in
+** pthreads builds.
+**
+** Returns LSM_OK on success or an LSM error code.
+*/
+static int testLsmOpen(
+  const char *zCfg,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  static const DatabaseMethods LsmMethods = {
+    test_lsm_close,
+    test_lsm_write,
+    test_lsm_delete,
+    test_lsm_delete_range,
+    test_lsm_fetch,
+    test_lsm_scan,
+    test_lsm_begin,
+    test_lsm_commit,
+    test_lsm_rollback
+  };
+
+  int rc;
+  int nFilename;
+  LsmDb *pDb;
+
+  /* If the bClear flag is set, delete any existing database. */
+  assert( zFilename);
+  if( bClear ) testDeleteLsmdb(zFilename);
+  nFilename = strlen(zFilename);
+
+  /* The file name is stored in the same allocation, after the struct */
+  pDb = (LsmDb *)testMalloc(sizeof(LsmDb) + nFilename + 1);
+  memset(pDb, 0, sizeof(LsmDb));
+  pDb->base.pMethods = &LsmMethods;
+  pDb->zName = (char *)&pDb[1];
+  memcpy(pDb->zName, zFilename, nFilename + 1);
+
+  /* Default the sector size used for crash simulation to 256 bytes.
+  ** Todo: There should be an OS method to obtain this value - just as
+  ** there is in SQLite. For now, LSM assumes that it is smaller than
+  ** the page size (default 4KB).
+  */
+  pDb->szSector = 256;
+
+  /* Default values for the mt_min_ckpt and mt_max_ckpt parameters. */
+  pDb->nMtMinCkpt = LSMTEST_DFLT_MT_MIN_CKPT;
+  pDb->nMtMaxCkpt = LSMTEST_DFLT_MT_MAX_CKPT;
+
+  memcpy(&pDb->env, tdb_lsm_env(), sizeof(lsm_env));
+  pDb->env.pVfsCtx = (void *)pDb;
+  pDb->env.xFullpath = testEnvFullpath;
+  pDb->env.xOpen = testEnvOpen;
+  pDb->env.xRead = testEnvRead;
+  pDb->env.xWrite = testEnvWrite;
+  pDb->env.xTruncate = testEnvTruncate;
+  pDb->env.xSync = testEnvSync;
+  pDb->env.xSectorSize = testEnvSectorSize;
+  pDb->env.xRemap = testEnvRemap;
+  pDb->env.xFileid = testEnvFileid;
+  pDb->env.xClose = testEnvClose;
+  pDb->env.xUnlink = testEnvUnlink;
+  pDb->env.xLock = testEnvLock;
+  pDb->env.xTestLock = testEnvTestLock;
+  pDb->env.xShmBarrier = testEnvShmBarrier;
+  pDb->env.xShmMap = testEnvShmMap;
+  pDb->env.xShmUnmap = testEnvShmUnmap;
+  pDb->env.xSleep = testEnvSleep;
+
+  rc = lsm_new(&pDb->env, &pDb->db);
+  if( rc==LSM_OK ){
+    int nThread = 1;
+    lsm_config_log(pDb->db, xLog, 0);
+    lsm_config_work_hook(pDb->db, xWorkHook, (void *)pDb);
+
+    rc = test_lsm_config_str(pDb, pDb->db, 0, zCfg, &nThread);
+    if( rc==LSM_OK ) rc = lsm_open(pDb->db, zFilename);
+
+    pDb->eMode = nThread;
+#ifdef LSM_MUTEX_PTHREADS
+    if( rc==LSM_OK && nThread>1 ){
+      /* NOTE(review): the result of starting the workers is not
+      ** checked here - confirm whether a failure should be reported. */
+      testLsmStartWorkers(pDb, nThread, zFilename, zCfg);
+    }
+#endif
+
+    if( rc!=LSM_OK ){
+      test_lsm_close((TestDb *)pDb);
+      pDb = 0;
+    }
+  }else{
+    /* lsm_new() failed, so there is no connection to close. Free the
+    ** handle rather than leaking it and returning it to the caller
+    ** alongside an error code. */
+    testFree(pDb);
+    pDb = 0;
+  }
+
+  *ppDb = (TestDb *)pDb;
+  return rc;
+}
+
+/*
+** Open an LSM database using zSpec as the configuration string. This is
+** a thin wrapper around testLsmOpen().
+*/
+int test_lsm_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  int res = testLsmOpen(zSpec, zFilename, bClear, ppDb);
+  return res;
+}
+
+/*
+** Open an LSM database with a fixed small-page configuration (256 byte
+** pages, block_size=64, mmap=1024). The zSpec argument is ignored.
+*/
+int test_lsm_small_open(
+  const char *zSpec,
+  const char *zFile,
+  int bClear,
+  TestDb **ppDb
+){
+  return testLsmOpen(
+      "page_size=256 block_size=64 mmap=1024", zFile, bClear, ppDb
+  );
+}
+
+/*
+** Open an LSM database with a fixed low-memory configuration: small
+** pages and blocks, autoflush=16, autocheckpoint=32 and memory mapping
+** disabled. The zSpec argument is ignored.
+*/
+int test_lsm_lomem_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  /* "max_freelist=4 autocheckpoint=32" */
+  /* Each fragment ends with a space so that adjacent settings stay
+  ** separated after string concatenation. Previously "autocheckpoint=32"
+  ** ran straight into "mmap=0". */
+  const char *zCfg =
+    "page_size=256 block_size=64 autoflush=16 "
+    "autocheckpoint=32 "
+    "mmap=0 "
+  ;
+  return testLsmOpen(zCfg, zFilename, bClear, ppDb);
+}
+
+/*
+** Open an LSM database using the low-memory style settings plus
+** "compression=1". The zSpec argument is ignored.
+*/
+int test_lsm_zip_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  static const char zZipCfg[] =
+    "page_size=256 block_size=64 autoflush=16 "
+    "autocheckpoint=32 compression=1 mmap=0 "
+  ;
+  return testLsmOpen(zZipCfg, zFilename, bClear, ppDb);
+}
+
+/*
+** If pDb is an LSM handle (identified by its xClose method being
+** test_lsm_close), return the underlying lsm_db connection. Otherwise
+** return NULL.
+*/
+lsm_db *tdb_lsm(TestDb *pDb){
+  int bIsLsm = (pDb->pMethods->xClose==test_lsm_close);
+  return bIsLsm ? ((LsmDb *)pDb)->db : 0;
+}
+
+/*
+** Return 1 if pDb is an LSM handle whose mode is anything other than
+** LSMTEST_MODE_SINGLETHREAD, or 0 otherwise.
+*/
+int tdb_lsm_multithread(TestDb *pDb){
+  if( tdb_lsm(pDb)==0 ) return 0;
+  return ((LsmDb*)pDb)->eMode!=LSMTEST_MODE_SINGLETHREAD;
+}
+
+/*
+** Enable or disable log output for the client connection of LSM handle
+** pDb. When enabled, messages are printed by xLog() with the label
+** "client". A no-op for non-LSM handles.
+*/
+void tdb_lsm_enable_log(TestDb *pDb, int bEnable){
+  lsm_db *pConn = tdb_lsm(pDb);
+  if( pConn==0 ) return;
+
+  if( bEnable ){
+    lsm_config_log(pConn, xLog, (void *)"client");
+  }else{
+    lsm_config_log(pConn, 0, (void *)"client");
+  }
+}
+
+/*
+** Simulate an application crash: set the bCrashed flag so that the test
+** VFS methods that check it start returning LSM_IOERR. The file-system
+** is not modified. A no-op for non-LSM handles.
+*/
+void tdb_lsm_application_crash(TestDb *pDb){
+  if( tdb_lsm(pDb)==0 ) return;
+  ((LsmDb *)pDb)->bCrashed = 1;
+}
+
+/*
+** Start recording pre-write sector images so that a later call to
+** tdb_lsm_system_crash() can simulate a power failure. A no-op for
+** non-LSM handles.
+*/
+void tdb_lsm_prepare_system_crash(TestDb *pDb){
+  if( tdb_lsm(pDb)==0 ) return;
+  ((LsmDb *)pDb)->bPrepareCrash = 1;
+}
+
+/*
+** Simulate a system crash (power failure) now: mark the handle as
+** crashed, then let doSystemCrash() rewrite the unsynced sectors. A
+** no-op for non-LSM handles.
+*/
+void tdb_lsm_system_crash(TestDb *pDb){
+  LsmDb *pLsm;
+  if( tdb_lsm(pDb)==0 ) return;
+
+  pLsm = (LsmDb *)pDb;
+  pLsm->bCrashed = 1;
+  doSystemCrash(pLsm);
+}
+
+/*
+** Set LSM_CONFIG_SAFETY on the client connection of LSM handle pDb.
+** eMode must be one of the LSM_SAFETY_XXX constants. A no-op for
+** non-LSM handles.
+*/
+void tdb_lsm_safety(TestDb *pDb, int eMode){
+  int iSafety = eMode;
+
+  assert( eMode==LSM_SAFETY_OFF
+       || eMode==LSM_SAFETY_NORMAL
+       || eMode==LSM_SAFETY_FULL
+  );
+  if( tdb_lsm(pDb)==0 ) return;
+
+  lsm_config(((LsmDb *)pDb)->db, LSM_CONFIG_SAFETY, &iSafety);
+}
+
+/*
+** Arrange for a system crash to be simulated during the iSync'th call
+** to xSync from now (iSync must be greater than zero). Recording of
+** pre-write sector images is enabled as well. A no-op for non-LSM
+** handles.
+*/
+void tdb_lsm_prepare_sync_crash(TestDb *pDb, int iSync){
+  LsmDb *pLsm;
+
+  assert( iSync>0 );
+  if( tdb_lsm(pDb)==0 ) return;
+
+  pLsm = (LsmDb *)pDb;
+  pLsm->nAutoCrash = iSync;
+  pLsm->bPrepareCrash = 1;
+}
+
+/*
+** Install (or, with xWork==NULL, remove) the user-level work hook for
+** LSM handle pDb. The hook is stored in the LsmDb and invoked by the
+** work-hook callbacks registered with the connections. A no-op for
+** non-LSM handles.
+*/
+void tdb_lsm_config_work_hook(
+  TestDb *pDb,
+  void (*xWork)(lsm_db *, void *),
+  void *pWorkCtx
+){
+  LsmDb *pLsm;
+  if( tdb_lsm(pDb)==0 ) return;
+
+  pLsm = (LsmDb *)pDb;
+  pLsm->xWork = xWork;
+  pLsm->pWorkCtx = pWorkCtx;
+}
+
+/*
+** Install (or, with xWrite==NULL, remove) the IO logging hook for LSM
+** handle pDb. testEnvWrite() and testEnvSync() invoke the hook after
+** each timed operation. A no-op for non-LSM handles.
+*/
+void tdb_lsm_write_hook(
+  TestDb *pDb,
+  void (*xWrite)(void *, int, lsm_i64, int, int),
+  void *pWriteCtx
+){
+  LsmDb *pLsm;
+  if( tdb_lsm(pDb)==0 ) return;
+
+  pLsm = (LsmDb *)pDb;
+  pLsm->xWriteHook = xWrite;
+  pLsm->pWriteCtx = pWriteCtx;
+}
+
+/*
+** Public entry point for opening an LSM TestDb handle with an explicit
+** configuration string. See testLsmOpen() for details.
+*/
+int tdb_lsm_open(const char *zCfg, const char *zDb, int bClear, TestDb **ppDb){
+  int res = testLsmOpen(zCfg, zDb, bClear, ppDb);
+  return res;
+}
+
+#ifdef LSM_MUTEX_PTHREADS
+
+/*
+** Signal worker thread iWorker that there may be work to do.
+*/
+static void mt_signal_worker(LsmDb *pDb, int iWorker){
+  LsmWorker *pTarget = &pDb->aWorker[iWorker];
+
+  /* bDoWork is set under the mutex so that a worker that is not yet
+  ** waiting on the condition variable still sees the request. */
+  pthread_mutex_lock(&pTarget->worker_mutex);
+  pTarget->bDoWork = 1;
+  pthread_cond_signal(&pTarget->worker_cond);
+  pthread_mutex_unlock(&pTarget->worker_mutex);
+}
+
+/*
+** This routine is used as the main() for all worker threads.
+*/
+static void *worker_main(void *pArg){
+ LsmWorker *p = (LsmWorker *)pArg;
+ lsm_db *pWorker; /* Connection to access db through */
+
+ /* The loop runs with worker_mutex held, except while doing work. It
+ ** ends when mt_stop_worker() clears p->pWorker or an error occurs. */
+ pthread_mutex_lock(&p->worker_mutex);
+ while( (pWorker = p->pWorker) ){
+ int rc = LSM_OK;
+
+ /* Do some work. If an error occurs, exit. */
+
+ pthread_mutex_unlock(&p->worker_mutex);
+ if( p->eType==LSMTEST_THREAD_CKPT ){
+ /* Checkpointer: checkpoint once the reported checkpoint size has
+ ** reached nMtMinCkpt. */
+ int nKB = 0;
+ rc = lsm_info(pWorker, LSM_INFO_CHECKPOINT_SIZE, &nKB);
+ if( rc==LSM_OK && nKB>=p->pDb->nMtMinCkpt ){
+ rc = lsm_checkpoint(pWorker, 0);
+ }
+ }else{
+ /* Worker: call lsm_work() repeatedly until it reports that nothing
+ ** was written or the thread has been asked to stop. */
+ int nWrite;
+ do {
+
+ /* A worker without auto-checkpoint waits for the checkpointer
+ ** thread to catch up before doing more work. */
+ if( p->eType==LSMTEST_THREAD_WORKER ){
+ waitOnCheckpointer(p->pDb, pWorker);
+ }
+
+ nWrite = 0;
+ rc = lsm_work(pWorker, 0, 256, &nWrite);
+
+ /* Data was written, so wake worker 1 */
+ if( p->eType==LSMTEST_THREAD_WORKER && nWrite ){
+ mt_signal_worker(p->pDb, 1);
+ }
+ /* NOTE(review): p->pWorker is read here without worker_mutex held */
+ }while( nWrite && p->pWorker );
+ }
+ pthread_mutex_lock(&p->worker_mutex);
+
+ /* LSM_BUSY is not treated as an error. Any other error is recorded in
+ ** worker_rc and ends the thread. */
+ if( rc!=LSM_OK && rc!=LSM_BUSY ){
+ p->worker_rc = rc;
+ break;
+ }
+
+ /* The thread will wake up when it is signaled either because another
+ ** thread has created some work for this one or because the connection
+ ** is being closed. */
+ if( p->pWorker && p->bDoWork==0 ){
+ pthread_cond_wait(&p->worker_cond, &p->worker_mutex);
+ }
+ p->bDoWork = 0;
+ }
+ pthread_mutex_unlock(&p->worker_mutex);
+
+ return 0;
+}
+
+
+/*
+** Stop worker thread iWorker of pDb, if it is running. The worker is
+** told to exit by clearing LsmWorker.pWorker under the mutex and
+** signalling the condition variable. This function then joins the
+** thread, destroys the synchronization objects and closes the worker
+** connection.
+*/
+static void mt_stop_worker(LsmDb *pDb, int iWorker){
+  LsmWorker *pW = &pDb->aWorker[iWorker];
+  lsm_db *pConn;                  /* Worker connection to close */
+  void *pExit;                    /* Thread exit value (unused) */
+
+  if( pW->pWorker==0 ) return;
+
+  /* Signal the worker to stop */
+  pthread_mutex_lock(&pW->worker_mutex);
+  pConn = pW->pWorker;
+  pW->pWorker = 0;
+  pthread_cond_signal(&pW->worker_cond);
+  pthread_mutex_unlock(&pW->worker_mutex);
+
+  /* Wait for the thread to finish */
+  pthread_join(pW->worker_thread, &pExit);
+
+  /* Release the resources allocated by mt_start_worker() */
+  pthread_cond_destroy(&pW->worker_cond);
+  pthread_mutex_destroy(&pW->worker_mutex);
+  lsm_close(pConn);
+}
+
+/*
+** Stop all worker threads belonging to pDb. Does nothing if no workers
+** were started (nWorker==0).
+*/
+static void mt_shutdown(LsmDb *pDb){
+  int i;
+  for(i=0; i<pDb->nWorker; i++){
+    mt_stop_worker(pDb, i);
+  }
+}
+
+/*
+** This callback is invoked by LSM when the client database writes to
+** the database file (i.e. to flush the contents of the in-memory tree).
+** This implies there may be work to do on the database, so signal
+** the worker threads.
+*/
+static void mt_client_work_hook(lsm_db *db, void *pArg){
+  LsmDb *pLsm = (LsmDb *)pArg;    /* LsmDb database handle */
+
+  /* Relay to the user level work-hook first, if one is installed */
+  if( pLsm->xWork ){
+    pLsm->xWork(db, pLsm->pWorkCtx);
+  }
+
+  /* Then wake up worker thread 0 */
+  mt_signal_worker(pLsm, 0);
+}
+
+/*
+** Work-hook registered with worker connections. Unlike
+** mt_client_work_hook() it only relays to the user level work-hook; no
+** thread is signalled.
+*/
+static void mt_worker_work_hook(lsm_db *db, void *pArg){
+  LsmDb *pLsm = (LsmDb *)pArg;    /* LsmDb database handle */
+  if( pLsm->xWork==0 ) return;
+  pLsm->xWork(db, pLsm->pWorkCtx);
+}
+
+/*
+** Launch worker thread iWorker for database connection pDb.
+*/
+/*
+** Open a worker connection on zFilename for slot iWorker of pDb and
+** start a thread of type eType to drive it.
+**
+** Returns 0 on success, or the non-zero code from the LSM or pthread
+** call that failed. On failure everything created here is released and
+** LsmWorker.pWorker is left NULL, so mt_stop_worker() will not try to
+** join a thread that was never started.
+*/
+static int mt_start_worker(
+  LsmDb *pDb,                     /* Main database structure */
+  int iWorker,                    /* Worker number to start */
+  const char *zFilename,          /* File name of database to open */
+  const char *zCfg,               /* Connection configuration string */
+  int eType                       /* Type of worker thread */
+){
+  int rc = 0;                     /* Return code */
+  int bCond = 0;                  /* True once worker_cond is initialized */
+  int bMutex = 0;                 /* True once worker_mutex is initialized */
+  LsmWorker *p;                   /* Object to initialize */
+
+  assert( iWorker<pDb->nWorker );
+  assert( eType==LSMTEST_THREAD_CKPT
+       || eType==LSMTEST_THREAD_WORKER
+       || eType==LSMTEST_THREAD_WORKER_AC
+  );
+
+  p = &pDb->aWorker[iWorker];
+  p->eType = eType;
+  p->pDb = pDb;
+
+  /* Open the worker connection. The configuration string is only
+  ** applied if a connection handle was actually created. */
+  rc = lsm_new(&pDb->env, &p->pWorker);
+  if( rc==0 && zCfg ){
+    test_lsm_config_str(pDb, p->pWorker, 1, zCfg, 0);
+  }
+  if( rc==0 ) rc = lsm_open(p->pWorker, zFilename);
+
+  if( rc==0 ){
+    lsm_config_log(p->pWorker, xLog, (void *)"worker");
+
+    /* Configure the work-hook */
+    lsm_config_work_hook(p->pWorker, mt_worker_work_hook, (void *)pDb);
+
+    /* A plain worker leaves checkpointing to the checkpointer thread */
+    if( eType==LSMTEST_THREAD_WORKER ){
+      test_lsm_config_str(0, p->pWorker, 1, "autocheckpoint=0", 0);
+    }
+  }
+
+  /* Kick off the worker thread. */
+  if( rc==0 ){
+    rc = pthread_cond_init(&p->worker_cond, 0);
+    bCond = (rc==0);
+  }
+  if( rc==0 ){
+    rc = pthread_mutex_init(&p->worker_mutex, 0);
+    bMutex = (rc==0);
+  }
+  if( rc==0 ) rc = pthread_create(&p->worker_thread, 0, worker_main, (void *)p);
+
+  /* On failure no thread is running. Undo the partial setup. */
+  if( rc!=0 ){
+    if( bMutex ) pthread_mutex_destroy(&p->worker_mutex);
+    if( bCond ) pthread_cond_destroy(&p->worker_cond);
+    if( p->pWorker ) lsm_close(p->pWorker);
+    p->pWorker = 0;
+  }
+
+  return rc;
+}
+
+
+/*
+** Start the background threads required by threading model eModel (an
+** LSMTEST_MODE_XXX value) for database pDb.
+**
+** Returns 1 if eModel is out of range, 0 immediately for the
+** single-threaded model, and otherwise the result of starting the
+** worker thread(s).
+*/
+static int testLsmStartWorkers(
+  LsmDb *pDb, int eModel, const char *zFilename, const char *zCfg
+){
+  int rc;
+
+  if( eModel<1 || eModel>4 ) return 1;
+  if( eModel==1 ) return 0;
+
+  /* Configure a work-hook for the client connection. Worker 0 is signalled
+  ** every time the users connection writes to the database. */
+  lsm_config_work_hook(pDb->db, mt_client_work_hook, (void *)pDb);
+
+  /* Two worker slots are always allocated, even if only one is used */
+  pDb->aWorker = (LsmWorker *)testMalloc(sizeof(LsmWorker) * 2);
+  memset(pDb->aWorker, 0, sizeof(LsmWorker) * 2);
+
+  if( eModel==LSMTEST_MODE_BACKGROUND_CKPT ){
+    /* Client does the work; one thread does the checkpoints */
+    pDb->nWorker = 1;
+    test_lsm_config_str(0, pDb->db, 0, "autocheckpoint=0", 0);
+    rc = mt_start_worker(pDb, 0, zFilename, zCfg, LSMTEST_THREAD_CKPT);
+  }else if( eModel==LSMTEST_MODE_BACKGROUND_WORK ){
+    /* One thread does both work and auto-checkpoints */
+    pDb->nWorker = 1;
+    test_lsm_config_str(0, pDb->db, 0, "autowork=0", 0);
+    rc = mt_start_worker(pDb, 0, zFilename, zCfg, LSMTEST_THREAD_WORKER_AC);
+  }else{
+    /* LSMTEST_MODE_BACKGROUND_BOTH: separate worker and checkpointer */
+    pDb->nWorker = 2;
+    test_lsm_config_str(0, pDb->db, 0, "autowork=0", 0);
+    rc = mt_start_worker(pDb, 0, zFilename, zCfg, LSMTEST_THREAD_WORKER);
+    if( rc==0 ){
+      rc = mt_start_worker(pDb, 1, zFilename, zCfg, LSMTEST_THREAD_CKPT);
+    }
+  }
+
+  return rc;
+}
+
+
+/*
+** Open an LSM database with "mt_mode=2" (LSMTEST_MODE_BACKGROUND_CKPT).
+** The zSpec argument is ignored.
+*/
+int test_lsm_mt2(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  return testLsmOpen("mt_mode=2", zFilename, bClear, ppDb);
+}
+
+/*
+** Open an LSM database with "mt_mode=4" (LSMTEST_MODE_BACKGROUND_BOTH).
+** The zSpec argument is ignored.
+*/
+int test_lsm_mt3(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  return testLsmOpen("mt_mode=4", zFilename, bClear, ppDb);
+}
+
+#else
+/* Builds without LSM_MUTEX_PTHREADS have no worker threads to stop. */
+static void mt_shutdown(LsmDb *pDb){
+  unused_parameter(pDb);
+}
+/*
+** Stand-in used when the build lacks LSM_MUTEX_PTHREADS. Prints an
+** explanatory message and returns 1 without opening anything.
+*/
+int test_lsm_mt(const char *zFilename, int bClear, TestDb **ppDb){
+  unused_parameter(ppDb);
+  unused_parameter(bClear);
+  unused_parameter(zFilename);
+
+  testPrintError("threads unavailable - recompile with LSM_MUTEX_PTHREADS\n");
+  return 1;
+}
+#endif
diff --git a/ext/lsm1/lsm-test/lsmtest_tdb4.c b/ext/lsm1/lsm-test/lsmtest_tdb4.c
new file mode 100644
index 0000000..c45b052
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_tdb4.c
@@ -0,0 +1,982 @@
+
+/*
+** This file contains the TestDb bt wrapper.
+*/
+
+#include "lsmtest_tdb.h"
+#include "lsmtest.h"
+#include
+#include "bt.h"
+
+#include
+
+typedef struct BtDb BtDb;
+typedef struct BtFile BtFile;
+
+/* Background checkpointer interface (see implementations below). */
+typedef struct bt_ckpter bt_ckpter;
+static int bgc_attach(BtDb *pDb, const char*);
+static int bgc_detach(BtDb *pDb);
+
+/*
+** Each database or log file opened by a database handle is wrapped by
+** an object of the following type.
+**
+** pBt is only set by btVfsOpen() when the file is opened without the
+** BT_OPEN_SHARED flag, and bt_close() clears it again. The VFS wrapper
+** methods skip all crash-simulation checks when pBt is NULL.
+*/
+struct BtFile {
+ BtDb *pBt; /* Database handle that opened this file */
+ bt_env *pVfs; /* Underlying VFS */
+ bt_file *pFile; /* File handle belonging to underlying VFS */
+ int nSectorSize; /* Size of sectors in bytes */
+ int nSector; /* Allocated size of apSector[] array */
+ u8 **apSector; /* Original sector data */
+};
+
+/*
+** nCrashSync:
+** If this value is non-zero, then a "crash-test" is running. If
+** nCrashSync==1, then the crash is simulated during the very next
+** call to the xSync() VFS method (on either the db or log file).
+** If nCrashSync==2, the following call to xSync(), and so on.
+**
+** bCrash:
+** After a crash is simulated, this variable is set. Any subsequent
+** attempts to write to a file or modify the file system in any way
+** fail once this is set. All the caller can do is close the connection.
+**
+** bFastInsert:
+** If this variable is set to true, then a BT_CONTROL_FAST_INSERT_OP
+** control is issued before each call to BtReplace() or BtCsrOpen().
+**
+** nRef:
+** Set to 1 by test_bt_open() and incremented by btVfsOpen() each time
+** a file is opened successfully.
+*/
+struct BtDb {
+ TestDb base; /* Base class */
+ bt_db *pBt; /* bt database handle */
+ sqlite4_env *pEnv; /* SQLite environment (for malloc/free) */
+ bt_env *pVfs; /* Underlying VFS */
+ int bFastInsert; /* True to use fast-insert */
+
+ /* Space for bt_fetch() results */
+ u8 *aBuffer; /* Space to store results */
+ int nBuffer; /* Allocated size of aBuffer[] in bytes */
+ int nRef; /* Reference count (see above) */
+
+ /* Background checkpointer used by mt connections */
+ bt_ckpter *pCkpter;
+
+ /* Stuff used for crash test simulation */
+ BtFile *apFile[2]; /* Database and log files used by pBt */
+ bt_env env; /* Private VFS for this object */
+ int nCrashSync; /* Number of syncs until crash (see above) */
+ int bCrash; /* True once a crash has been simulated */
+};
+
+/*
+** xFullpath method of the wrapper VFS. Returns SQLITE4_IOERR once a crash
+** has been simulated on the owning BtDb; otherwise the request is passed
+** to the underlying VFS.
+*/
+static int btVfsFullpath(
+  sqlite4_env *pEnv,
+  bt_env *pVfs,
+  const char *z,
+  char **pzOut
+){
+  BtDb *pDb = (BtDb*)pVfs->pVfsCtx;
+  int rc = SQLITE4_IOERR;
+  if( !pDb->bCrash ){
+    bt_env *pReal = pDb->pVfs;
+    rc = pReal->xFullpath(pEnv, pReal, z, pzOut);
+  }
+  return rc;
+}
+
+/*
+** xOpen method of the wrapper VFS.
+**
+** Allocates a BtFile wrapper, registers it in pBt->apFile[0] (database
+** file) or pBt->apFile[1] (log file) according to the open flags, and
+** opens the real file through the underlying VFS. Unless BT_OPEN_SHARED
+** is set, the wrapper is linked back to the BtDb so that crash simulation
+** applies to it.
+**
+** Returns SQLITE4_IOERR if a crash has already been simulated,
+** SQLITE4_NOMEM if the wrapper cannot be allocated, or the result of the
+** underlying xOpen(). On failure *ppFile is set to NULL and the wrapper
+** is removed from apFile[] before it is freed, so that btVfsSync() and
+** bt_close() never see a dangling pointer.
+*/
+static int btVfsOpen(
+  sqlite4_env *pEnv,
+  bt_env *pVfs,
+  const char *zFile,
+  int flags, bt_file **ppFile
+){
+  BtFile *p;
+  BtDb *pBt = (BtDb*)pVfs->pVfsCtx;
+  int rc;
+
+  if( pBt->bCrash ) return SQLITE4_IOERR;
+
+  p = (BtFile*)testMalloc(sizeof(BtFile));
+  if( !p ) return SQLITE4_NOMEM;
+  if( flags & BT_OPEN_DATABASE ){
+    pBt->apFile[0] = p;
+  }else if( flags & BT_OPEN_LOG ){
+    pBt->apFile[1] = p;
+  }
+  if( (flags & BT_OPEN_SHARED)==0 ){
+    p->pBt = pBt;
+  }
+  p->pVfs = pBt->pVfs;
+
+  /* NOTE(review): the wrapper env (pVfs), not pBt->pVfs, is passed to the
+  ** underlying xOpen(). This is kept as in the original - confirm it is
+  ** intentional. */
+  rc = pBt->pVfs->xOpen(pEnv, pVfs, zFile, flags, &p->pFile);
+  if( rc!=SQLITE4_OK ){
+    /* Unregister the wrapper before freeing it. */
+    if( pBt->apFile[0]==p ) pBt->apFile[0] = 0;
+    if( pBt->apFile[1]==p ) pBt->apFile[1] = 0;
+    testFree(p);
+    p = 0;
+  }else{
+    pBt->nRef++;
+  }
+
+  *ppFile = (bt_file*)p;
+  return rc;
+}
+
+/*
+** xSize method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsSize(bt_file *pFile, sqlite4_int64 *piRes){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xSize(pWrap->pFile, piRes);
+}
+
+/*
+** xRead method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsRead(bt_file *pFile, sqlite4_int64 iOff, void *pBuf, int nBuf){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xRead(pWrap->pFile, iOff, pBuf, nBuf);
+}
+
+/*
+** Discard the sector images saved for file p by btSaveSectors().
+**
+** If a crash is being simulated (p->pBt->bCrash is set), each saved
+** sector is first given one of three pseudo-randomly selected fates (see
+** the comment in the loop) before its image is freed. Argument iFile
+** (0 for the db file, 1 for the log) is only used by the disabled
+** debugging output.
+**
+** Returns SQLITE4_OK, or the error code from a failed xSize() or xWrite()
+** call. The loop stops at the first failed write, so later sectors keep
+** their saved images.
+*/
+static int btFlushSectors(BtFile *p, int iFile){
+  sqlite4_int64 iSz;
+  int rc;
+  int i;
+  u8 *aTmp = 0;
+
+  rc = p->pBt->pVfs->xSize(p->pFile, &iSz);
+  for(i=0; rc==SQLITE4_OK && i<p->nSector; i++){
+    if( p->pBt->bCrash && p->apSector[i] ){
+
+      /* The system is simulating a crash. There are three choices for
+      ** this sector:
+      **
+      **   1) Restore the original data (simulating a lost write),
+      **   2) Leave it as it is (simulating a successful write),
+      **   3) Populate the disk sector with garbage data.
+      */
+      sqlite4_int64 iSOff = (sqlite4_int64)p->nSectorSize*i;
+      int nWrite = MIN(p->nSectorSize, iSz - iSOff);
+
+      /* nWrite is zero or negative if the sector lies at or beyond the
+      ** current end of the file. There is nothing to write in that case. */
+      if( nWrite>0 ){
+        u8 *aWrite = 0;
+        int iOpt = (testPrngValue(i) % 3) + 1;
+        if( iOpt==1 ){
+          aWrite = p->apSector[i];
+        }else if( iOpt==3 ){
+          if( aTmp==0 ) aTmp = testMalloc(p->nSectorSize);
+          aWrite = aTmp;
+          testPrngArray(i*13, (u32*)aWrite, nWrite/sizeof(u32));
+        }
+
+#if 0
+fprintf(stderr, "handle sector %d of %s with %s\n", i,
+  iFile==0 ? "db" : "log",
+  iOpt==1 ? "rollback" : iOpt==2 ? "write" : "omit"
+);
+fflush(stderr);
+#endif
+
+        if( aWrite ){
+          rc = p->pBt->pVfs->xWrite(p->pFile, iSOff, aWrite, nWrite);
+        }
+      }
+    }
+    testFree(p->apSector[i]);
+    p->apSector[i] = 0;
+  }
+
+  testFree(aTmp);
+  return rc;
+}
+
+/*
+** Save a copy of the current on-disk content of every sector touched by
+** a write of nBuf bytes at offset iOff of file p. The copies are stored
+** in p->apSector[], indexed by sector number, for later use by
+** btFlushSectors().
+**
+** The sector size is read from the underlying VFS on first use and
+** clamped to a minimum of 512 bytes. The apSector[] array grows in steps
+** of 32 entries as required. If a sector already has a saved image, the
+** old image is freed and replaced.
+**
+** Returns SQLITE4_OK or the error code from a failed xSize()/xRead()
+** call. Nothing is stored for the sector whose read failed.
+*/
+static int btSaveSectors(BtFile *p, sqlite4_int64 iOff, int nBuf){
+  int rc;
+  sqlite4_int64 iSz;              /* Size of file on disk */
+  int iFirst;                     /* First sector affected */
+  int iSector;                    /* Current sector */
+  int iLast;                      /* Last sector affected */
+
+  if( p->nSectorSize==0 ){
+    p->nSectorSize = p->pBt->pVfs->xSectorSize(p->pFile);
+    if( p->nSectorSize<512 ) p->nSectorSize = 512;
+  }
+  iLast = (iOff+nBuf-1) / p->nSectorSize;
+  iFirst = iOff / p->nSectorSize;
+
+  rc = p->pBt->pVfs->xSize(p->pFile, &iSz);
+  for(iSector=iFirst; rc==SQLITE4_OK && iSector<=iLast; iSector++){
+    int nRead;
+    /* Widen before multiplying so large file offsets do not overflow. */
+    sqlite4_int64 iSOff = (sqlite4_int64)iSector * p->nSectorSize;
+    u8 *aBuf = testMalloc(p->nSectorSize);
+    nRead = MIN(p->nSectorSize, (iSz - iSOff));
+    if( nRead>0 ){
+      rc = p->pBt->pVfs->xRead(p->pFile, iSOff, aBuf, nRead);
+    }
+    if( rc!=SQLITE4_OK ){
+      /* Do not leak the buffer, and do not store it in a slot that may
+      ** lie beyond the end of apSector[]. */
+      testFree(aBuf);
+      break;
+    }
+
+    while( iSector>=p->nSector ){
+      int nNew = p->nSector + 32;
+      u8 **apNew = (u8**)testMalloc(nNew * sizeof(u8*));
+      if( p->nSector>0 ){
+        memcpy(apNew, p->apSector, p->nSector*sizeof(u8*));
+      }
+      testFree(p->apSector);
+      p->apSector = apNew;
+      p->nSector = nNew;
+    }
+
+    /* Release any image already saved for this sector before replacing it. */
+    testFree(p->apSector[iSector]);
+    p->apSector[iSector] = aBuf;
+  }
+
+  return rc;
+}
+
+/*
+** xWrite method of the wrapper VFS. Fails with SQLITE4_IOERR after a
+** simulated crash. While a crash test is armed (nCrashSync!=0) the
+** affected sectors are saved first; the result of that save is ignored.
+** The write is then passed to the underlying VFS.
+*/
+static int btVfsWrite(bt_file *pFile, sqlite4_int64 iOff, void *pBuf, int nBuf){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner ){
+    if( pOwner->bCrash ) return SQLITE4_IOERR;
+    if( pOwner->nCrashSync ) btSaveSectors(pWrap, iOff, nBuf);
+  }
+  return pWrap->pVfs->xWrite(pWrap->pFile, iOff, pBuf, nBuf);
+}
+
+/*
+** xTruncate method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsTruncate(bt_file *pFile, sqlite4_int64 iOff){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xTruncate(pWrap->pFile, iOff);
+}
+
+/*
+** xSync method of the wrapper VFS.
+**
+** If the file belongs to a BtDb with a crash test armed, the sync
+** countdown (nCrashSync) is decremented. When it reaches zero the crash is
+** simulated: the saved sectors of both the db and log file are processed
+** by btFlushSectors() and SQLITE4_IOERR is returned without syncing.
+** Otherwise only this file's saved sectors are discarded and the sync
+** proceeds normally.
+**
+** Either apFile[] entry may be NULL (btVfsClose() clears them), so each is
+** checked before it is flushed.
+*/
+static int btVfsSync(bt_file *pFile){
+  int rc = SQLITE4_OK;
+  BtFile *p = (BtFile*)pFile;
+  BtDb *pBt = p->pBt;
+
+  if( pBt ){
+    if( pBt->bCrash ) return SQLITE4_IOERR;
+    if( pBt->nCrashSync ){
+      pBt->nCrashSync--;
+      pBt->bCrash = (pBt->nCrashSync==0);
+      if( pBt->bCrash ){
+        int iFile;
+        for(iFile=0; iFile<2; iFile++){
+          if( pBt->apFile[iFile] ){
+            btFlushSectors(pBt->apFile[iFile], iFile);
+          }
+        }
+        rc = SQLITE4_IOERR;
+      }else{
+        btFlushSectors(p, 0);
+      }
+    }
+  }
+
+  if( rc==SQLITE4_OK ){
+    rc = p->pVfs->xSync(p->pFile);
+  }
+  return rc;
+}
+
+/*
+** xSectorSize method of the wrapper VFS. Always defers to the underlying
+** VFS, even after a simulated crash.
+*/
+static int btVfsSectorSize(bt_file *pFile){
+  BtFile *pWrap = (BtFile*)pFile;
+  int nSz = pWrap->pVfs->xSectorSize(pWrap->pFile);
+  return nSz;
+}
+
+/*
+** Drop one reference to p. The object is freed when no references remain.
+*/
+static void btDeref(BtDb *p){
+  int nLeft = --p->nRef;
+  assert( nLeft>=0 );
+  if( nLeft<1 ){
+    testFree(p);
+  }
+}
+
+/*
+** xClose method of the wrapper VFS. If the file is still linked to a
+** BtDb, its saved sectors are flushed and it is removed from the handle's
+** apFile[] array. The underlying file is then closed and the wrapper
+** freed. Returns the result of the underlying xClose().
+*/
+static int btVfsClose(bt_file *pFile){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  int rc;
+
+  if( pOwner!=0 ){
+    int ii;
+    btFlushSectors(pWrap, 0);
+    for(ii=0; ii<2; ii++){
+      if( pOwner->apFile[ii]==pWrap ) pOwner->apFile[ii] = 0;
+    }
+  }
+  testFree(pWrap->apSector);
+  rc = pWrap->pVfs->xClose(pWrap->pFile);
+#if 0
+  btDeref(pWrap->pBt);
+#endif
+  testFree(pWrap);
+  return rc;
+}
+
+/*
+** xUnlink method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsUnlink(sqlite4_env *pEnv, bt_env *pVfs, const char *zFile){
+  BtDb *pDb = (BtDb*)pVfs->pVfsCtx;
+  int rc = SQLITE4_IOERR;
+  if( !pDb->bCrash ){
+    bt_env *pReal = pDb->pVfs;
+    rc = pReal->xUnlink(pEnv, pReal, zFile);
+  }
+  return rc;
+}
+
+/*
+** xLock method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsLock(bt_file *pFile, int iLock, int eType){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xLock(pWrap->pFile, iLock, eType);
+}
+
+/*
+** xTestLock method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsTestLock(bt_file *pFile, int iLock, int nLock, int eType){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xTestLock(pWrap->pFile, iLock, nLock, eType);
+}
+
+/*
+** xShmMap method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsShmMap(bt_file *pFile, int iChunk, int sz, void **ppOut){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xShmMap(pWrap->pFile, iChunk, sz, ppOut);
+}
+
+/*
+** xShmBarrier method of the wrapper VFS. Always defers to the underlying
+** VFS; unlike most other methods it is not blocked after a simulated
+** crash.
+**
+** The call is a plain statement: a return statement with an expression
+** is not permitted in a function returning void in standard C.
+*/
+static void btVfsShmBarrier(bt_file *pFile){
+  BtFile *p = (BtFile*)pFile;
+  p->pVfs->xShmBarrier(p->pFile);
+}
+
+/*
+** xShmUnmap method of the wrapper VFS: fails with SQLITE4_IOERR after a
+** simulated crash, otherwise defers to the underlying VFS.
+*/
+static int btVfsShmUnmap(bt_file *pFile, int bDelete){
+  BtFile *pWrap = (BtFile*)pFile;
+  BtDb *pOwner = pWrap->pBt;
+  if( pOwner!=0 && pOwner->bCrash ){
+    return SQLITE4_IOERR;
+  }
+  return pWrap->pVfs->xShmUnmap(pWrap->pFile, bDelete);
+}
+
+/*
+** xClose method of the bt TestDb wrapper. Closes the bt handle, releases
+** the bt_fetch() buffer, detaches from any background checkpointer and
+** frees the BtDb object. Returns the result of sqlite4BtClose().
+*/
+static int bt_close(TestDb *pTestDb){
+ BtDb *p = (BtDb*)pTestDb;
+ int rc = sqlite4BtClose(p->pBt);
+ free(p->aBuffer);
+ /* Wrapper file objects that are still registered must not refer to this
+ ** handle after it is freed below. */
+ if( p->apFile[0] ) p->apFile[0]->pBt = 0;
+ if( p->apFile[1] ) p->apFile[1]->pBt = 0;
+ bgc_detach(p);
+ testFree(p);
+ return rc;
+}
+
+/*
+** Ensure that handle p has a transaction open at level iMin or higher.
+**
+** If the current level is lower than iMin, a transaction is opened at
+** level iMin and *piLevel is set to the previous level, so that
+** btRestoreTransaction() can return to it afterwards. Otherwise nothing
+** is opened and *piLevel is set to -1, meaning "nothing to restore".
+**
+** Returns SQLITE4_OK or the error code from sqlite4BtBegin().
+*/
+static int btMinTransaction(BtDb *p, int iMin, int *piLevel){
+  int iLevel;
+  int rc = SQLITE4_OK;
+
+  iLevel = sqlite4BtTransactionLevel(p->pBt);
+  if( iLevel<iMin ){
+    rc = sqlite4BtBegin(p->pBt, iMin);
+    *piLevel = iLevel;
+  }else{
+    *piLevel = -1;
+  }
+
+  return rc;
+}
+/*
+** Counterpart to btMinTransaction(). If iLevel is non-negative, return
+** the handle to transaction level iLevel: by committing if rcin is
+** SQLITE4_OK, or by rolling back otherwise. A negative iLevel means
+** there is nothing to restore.
+**
+** Returns rcin, unless rcin is SQLITE4_OK and the commit fails, in which
+** case the commit's error code is returned.
+*/
+static int btRestoreTransaction(BtDb *p, int iLevel, int rcin){
+  int rc = rcin;
+  if( iLevel<0 ) return rc;
+
+  if( rc!=SQLITE4_OK ){
+    sqlite4BtRollback(p->pBt, iLevel);
+  }else{
+    rc = sqlite4BtCommit(p->pBt, iLevel);
+  }
+  assert( iLevel==sqlite4BtTransactionLevel(p->pBt) );
+  return rc;
+}
+
+/*
+** xWrite method of the bt TestDb wrapper. Writes key (pK/nK) with value
+** (pV/nV) using sqlite4BtReplace(), inside a transaction of level 2 or
+** higher. The transaction level in effect on entry is restored before
+** returning.
+*/
+static int bt_write(TestDb *pTestDb, void *pK, int nK, void *pV, int nV){
+  BtDb *pDb = (BtDb*)pTestDb;
+  int iSaved;
+  int rc;
+
+  rc = btMinTransaction(pDb, 2, &iSaved);
+  if( rc!=SQLITE4_OK ) return rc;
+
+  if( pDb->bFastInsert ){
+    sqlite4BtControl(pDb->pBt, BT_CONTROL_FAST_INSERT_OP, 0);
+  }
+  rc = sqlite4BtReplace(pDb->pBt, pK, nK, pV, nV);
+  return btRestoreTransaction(pDb, iSaved, rc);
+}
+
+/*
+** xDelete method of the bt TestDb wrapper. Implemented as a call to
+** bt_write() with a NULL value pointer and a value size of -1.
+*/
+static int bt_delete(TestDb *pTestDb, void *pK, int nK){
+  void *pNoValue = 0;
+  int rc = bt_write(pTestDb, pK, nK, pNoValue, -1);
+  return rc;
+}
+
+/*
+** xDeleteRange method of the bt TestDb wrapper. Deletes the entries whose
+** keys lie strictly between (pKey1/nKey1) and (pKey2/nKey2): an entry
+** equal to key 1 is stepped over, and the loop stops at the first entry
+** that is not smaller than key 2.
+**
+** Runs inside a transaction of level 2 or higher; the level in effect on
+** entry is restored before returning.
+*/
+static int bt_delete_range(
+ TestDb *pTestDb,
+ void *pKey1, int nKey1,
+ void *pKey2, int nKey2
+){
+ BtDb *p = (BtDb*)pTestDb;
+ bt_cursor *pCsr = 0;
+ int rc = SQLITE4_OK;
+ int iLevel;
+
+ rc = btMinTransaction(p, 2, &iLevel);
+ if( rc==SQLITE4_OK ){
+ if( p->bFastInsert ) sqlite4BtControl(p->pBt, BT_CONTROL_FAST_INSERT_OP, 0);
+ rc = sqlite4BtCsrOpen(p->pBt, 0, &pCsr);
+ }
+ /* Each iteration seeks again from key 1 and deletes at most one entry. */
+ while( rc==SQLITE4_OK ){
+ const void *pK;
+ int n;
+ int nCmp;
+ int res;
+
+ rc = sqlite4BtCsrSeek(pCsr, pKey1, nKey1, BT_SEEK_GE);
+ if( rc==SQLITE4_INEXACT ) rc = SQLITE4_OK;
+ if( rc!=SQLITE4_OK ) break;
+
+ rc = sqlite4BtCsrKey(pCsr, &pK, &n);
+ if( rc!=SQLITE4_OK ) break;
+
+ /* The cursor is on an entry >= key 1. If it is exactly key 1, step to
+ ** the next entry, as key 1 itself is not deleted. */
+ nCmp = MIN(n, nKey1);
+ res = memcmp(pKey1, pK, nCmp);
+ assert( res<0 || (res==0 && nKey1<=n) );
+ if( res==0 && nKey1==n ){
+ rc = sqlite4BtCsrNext(pCsr);
+ if( rc!=SQLITE4_OK ) break;
+ rc = sqlite4BtCsrKey(pCsr, &pK, &n);
+ if( rc!=SQLITE4_OK ) break;
+ }
+
+ /* Stop once the current entry is greater than or equal to key 2. */
+ nCmp = MIN(n, nKey2);
+ res = memcmp(pKey2, pK, nCmp);
+ if( res<0 || (res==0 && nKey2<=n) ) break;
+
+ rc = sqlite4BtDelete(pCsr);
+ }
+ /* Running off the end of the database is not an error. */
+ if( rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;
+
+ sqlite4BtCsrClose(pCsr);
+
+ rc = btRestoreTransaction(p, iLevel, rc);
+ return rc;
+}
+
+/*
+** xFetch method of the bt TestDb wrapper. Looks up key (pK/nK).
+**
+** If the key is found, the value is copied into the buffer owned by the
+** handle (p->aBuffer, grown as required) and *ppVal/*pnVal are set to
+** point at it. The pointer remains valid until the next call that grows
+** the buffer, or until the handle is closed. If the key is not found,
+** *ppVal is set to NULL and *pnVal to -1, and SQLITE4_OK is returned.
+**
+** If no transaction is open on entry, a level 1 transaction is opened
+** for the duration of the call. Returns SQLITE4_NOMEM if the result
+** buffer cannot be allocated.
+*/
+static int bt_fetch(
+  TestDb *pTestDb,
+  void *pK, int nK,
+  void **ppVal, int *pnVal
+){
+  BtDb *p = (BtDb*)pTestDb;
+  bt_cursor *pCsr = 0;
+  int iLevel;
+  int rc = SQLITE4_OK;
+
+  iLevel = sqlite4BtTransactionLevel(p->pBt);
+  if( iLevel==0 ){
+    rc = sqlite4BtBegin(p->pBt, 1);
+    if( rc!=SQLITE4_OK ) return rc;
+  }
+
+  if( p->bFastInsert ) sqlite4BtControl(p->pBt, BT_CONTROL_FAST_INSERT_OP, 0);
+  rc = sqlite4BtCsrOpen(p->pBt, 0, &pCsr);
+  if( rc==SQLITE4_OK ){
+    rc = sqlite4BtCsrSeek(pCsr, pK, nK, BT_SEEK_EQ);
+    if( rc==SQLITE4_OK ){
+      const void *pV = 0;
+      int nV = 0;
+      rc = sqlite4BtCsrData(pCsr, 0, -1, &pV, &nV);
+      if( rc==SQLITE4_OK && nV>p->nBuffer ){
+        /* Allocate the new buffer before releasing the old one, so that a
+        ** failed allocation leaves the handle in a consistent state. */
+        u8 *aNew = (u8*)malloc(nV*2);
+        if( aNew==0 ){
+          rc = SQLITE4_NOMEM;
+        }else{
+          free(p->aBuffer);
+          p->aBuffer = aNew;
+          p->nBuffer = nV*2;
+        }
+      }
+      if( rc==SQLITE4_OK ){
+        if( nV>0 ) memcpy(p->aBuffer, pV, nV);
+        *pnVal = nV;
+        *ppVal = (void*)(p->aBuffer);
+      }
+
+    }else if( rc==SQLITE4_INEXACT || rc==SQLITE4_NOTFOUND ){
+      *ppVal = 0;
+      *pnVal = -1;
+      rc = SQLITE4_OK;
+    }
+    sqlite4BtCsrClose(pCsr);
+  }
+
+  if( iLevel==0 ) sqlite4BtCommit(p->pBt, 0);
+  return rc;
+}
+
+/*
+** xScan method of the bt TestDb wrapper. Invokes xCallback once for each
+** entry with a key between (pFirst/nFirst) and (pLast/nLast) inclusive,
+** in ascending key order, or descending order if bReverse is true.
+**
+** A NULL pLast means "no upper bound". In a reverse scan a NULL pFirst
+** means "no lower bound"; in a forward scan pFirst/nFirst is always
+** passed to the initial seek.
+**
+** Runs inside a transaction of level 1 or higher; the level in effect on
+** entry is restored before returning.
+*/
+static int bt_scan(
+  TestDb *pTestDb,
+  void *pCtx,
+  int bReverse,
+  void *pFirst, int nFirst,
+  void *pLast, int nLast,
+  void (*xCallback)(void *, void *, int , void *, int)
+){
+  BtDb *p = (BtDb*)pTestDb;
+  bt_cursor *pCsr = 0;
+  int rc;
+  int iLevel;
+
+  rc = btMinTransaction(p, 1, &iLevel);
+
+  if( rc==SQLITE4_OK ){
+    if( p->bFastInsert ) sqlite4BtControl(p->pBt, BT_CONTROL_FAST_INSERT_OP, 0);
+    rc = sqlite4BtCsrOpen(p->pBt, 0, &pCsr);
+  }
+  if( rc==SQLITE4_OK ){
+    if( bReverse ){
+      if( pLast ){
+        rc = sqlite4BtCsrSeek(pCsr, pLast, nLast, BT_SEEK_LE);
+      }else{
+        rc = sqlite4BtCsrLast(pCsr);
+      }
+    }else{
+      rc = sqlite4BtCsrSeek(pCsr, pFirst, nFirst, BT_SEEK_GE);
+    }
+    if( rc==SQLITE4_INEXACT ) rc = SQLITE4_OK;
+
+    while( rc==SQLITE4_OK ){
+      const void *pK = 0; int nK = 0;
+      const void *pV = 0; int nV = 0;
+
+      rc = sqlite4BtCsrKey(pCsr, &pK, &nK);
+      if( rc==SQLITE4_OK ){
+        rc = sqlite4BtCsrData(pCsr, 0, -1, &pV, &nV);
+      }
+
+      if( rc!=SQLITE4_OK ) break;
+
+      /* Stop as soon as the cursor has moved past the far end of the
+      ** requested range. */
+      if( bReverse ){
+        if( pFirst ){
+          int res;
+          int nCmp = MIN(nK, nFirst);
+          res = memcmp(pFirst, pK, nCmp);
+          if( res>0 || (res==0 && nK<nFirst) ) break;
+        }
+      }else{
+        if( pLast ){
+          int res;
+          int nCmp = MIN(nK, nLast);
+          res = memcmp(pLast, pK, nCmp);
+          if( res<0 || (res==0 && nK>nLast) ) break;
+        }
+      }
+
+      xCallback(pCtx, (void*)pK, nK, (void*)pV, nV);
+      if( bReverse ){
+        rc = sqlite4BtCsrPrev(pCsr);
+      }else{
+        rc = sqlite4BtCsrNext(pCsr);
+      }
+    }
+    /* Running off either end of the database is not an error. */
+    if( rc==SQLITE4_NOTFOUND ) rc = SQLITE4_OK;
+
+    sqlite4BtCsrClose(pCsr);
+  }
+
+  rc = btRestoreTransaction(p, iLevel, rc);
+  return rc;
+}
+
+/*
+** xBegin method of the bt TestDb wrapper: forwards to sqlite4BtBegin().
+*/
+static int bt_begin(TestDb *pTestDb, int iLvl){
+  return sqlite4BtBegin(((BtDb*)pTestDb)->pBt, iLvl);
+}
+
+/*
+** xCommit method of the bt TestDb wrapper: forwards to sqlite4BtCommit().
+*/
+static int bt_commit(TestDb *pTestDb, int iLvl){
+  return sqlite4BtCommit(((BtDb*)pTestDb)->pBt, iLvl);
+}
+
+/*
+** xRollback method of the bt TestDb wrapper: forwards to
+** sqlite4BtRollback().
+*/
+static int bt_rollback(TestDb *pTestDb, int iLvl){
+  return sqlite4BtRollback(((BtDb*)pTestDb)->pBt, iLvl);
+}
+
+/*
+** Extract the next "name=value" pair from the string at *pzIn.
+**
+** Leading spaces are skipped. The name runs up to the first '=' and the
+** value from there up to the next space or the end of the string. Both
+** are copied, nul-terminated and back to back, into pSpace, which must be
+** large enough to hold them. *pzOpt and *pzArg are set to point at the
+** copies and *pzIn is advanced to the first character after the value.
+**
+** Returns 0 if a pair was extracted, or 1 if the rest of the input holds
+** no '=' character. In that case no output parameter is modified.
+*/
+static int testParseOption(
+  const char **pzIn,              /* IN/OUT: pointer to next option */
+  const char **pzOpt,             /* OUT: nul-terminated option name */
+  const char **pzArg,             /* OUT: nul-terminated option argument */
+  char *pSpace                    /* Temporary space for output params */
+){
+  const char *zName;              /* First character of option name */
+  const char *zEq;                /* The '=' that ends the name */
+  const char *zVal;               /* First character of option value */
+  const char *zEnd;               /* Character following the value */
+  char *zNameOut = pSpace;        /* Where the name is copied to */
+  char *zValOut;                  /* Where the value is copied to */
+  int nName;
+  int nVal;
+
+  zName = *pzIn;
+  while( zName[0]==' ' ) zName++;
+  for(zEq=zName; zEq[0]!='\0' && zEq[0]!='='; zEq++);
+  if( zEq[0]=='\0' ) return 1;
+  nName = (zEq - zName);
+
+  zVal = &zEq[1];
+  for(zEnd=zVal; zEnd[0]!='\0' && zEnd[0]!=' '; zEnd++);
+  nVal = (zEnd - zVal);
+
+  memcpy(zNameOut, zName, nName);
+  zNameOut[nName] = '\0';
+  zValOut = &zNameOut[nName+1];
+  memcpy(zValOut, zVal, nVal);
+  zValOut[nVal] = '\0';
+
+  *pzOpt = zNameOut;
+  *pzArg = zValOut;
+  *pzIn = zEnd;
+  return 0;
+}
+
+/*
+** Parse string z as a non-negative decimal integer, optionally followed
+** by a single 'K'/'k' (multiply by 1024) or 'M'/'m' (multiply by
+** 1024*1024) suffix. On success the value is written to *piVal and
+** SQLITE4_OK returned. If any other character follows, SQLITE4_ERROR is
+** returned and *piVal is left unchanged. No overflow checking is done.
+*/
+static int testParseInt(const char *z, int *piVal){
+  int iVal = 0;
+  int ii;
+
+  for(ii=0; z[ii]>='0' && z[ii]<='9'; ii++){
+    iVal = iVal*10 + (z[ii] - '0');
+  }
+  switch( z[ii] ){
+    case 'K': case 'k':
+      iVal = iVal * 1024;
+      ii++;
+      break;
+    case 'M': case 'm':
+      iVal = iVal * 1024 * 1024;
+      ii++;
+      break;
+    default:
+      break;
+  }
+
+  if( z[ii]!='\0' ) return SQLITE4_ERROR;
+  *piVal = iVal;
+  return SQLITE4_OK;
+}
+
+/*
+** Apply the space-separated "name=value" options in zCfg to handle pDb.
+** A NULL zCfg is a no-op.
+**
+** Options mapped to a BT_CONTROL_* code are passed to sqlite4BtControl()
+** with the parsed integer value. Two options are handled locally:
+**
+**   mt          - the value is stored in *pbMt,
+**   fastinsert  - pDb->bFastInsert is set to 1.
+**
+** Processing stops at the first option that cannot be resolved by
+** testArgSelect(), whose value fails testParseInt(), or for which
+** sqlite4BtControl() fails. The corresponding error code is returned.
+*/
+static int testBtConfigure(BtDb *pDb, const char *zCfg, int *pbMt){
+ int rc = SQLITE4_OK;
+
+ if( zCfg ){
+ struct CfgParam {
+ const char *zParam;
+ int eParam;
+ } aParam[] = {
+ { "safety", BT_CONTROL_SAFETY },
+ { "autockpt", BT_CONTROL_AUTOCKPT },
+ { "multiproc", BT_CONTROL_MULTIPROC },
+ { "blksz", BT_CONTROL_BLKSZ },
+ { "pagesz", BT_CONTROL_PAGESZ },
+ { "mt", -1 },
+ { "fastinsert", -2 },
+ { 0, 0 }
+ };
+ const char *z = zCfg;
+ int n = strlen(z);
+ char *aSpace;
+ const char *zOpt;
+ const char *zArg;
+
+ /* testParseOption() copies one name and one value, each with a nul
+ ** terminator, into aSpace - at most n+2 bytes. */
+ aSpace = (char*)testMalloc(n+2);
+ while( rc==SQLITE4_OK && 0==testParseOption(&z, &zOpt, &zArg, aSpace) ){
+ int i;
+ int iVal;
+ rc = testArgSelect(aParam, "param", zOpt, &i);
+ if( rc!=SQLITE4_OK ) break;
+
+ rc = testParseInt(zArg, &iVal);
+ if( rc!=SQLITE4_OK ) break;
+
+ switch( aParam[i].eParam ){
+ case -1:
+ *pbMt = iVal;
+ break;
+ case -2:
+ /* NOTE(review): the parsed value is ignored, so "fastinsert=0" also
+ ** enables fast-insert - confirm this is intended. */
+ pDb->bFastInsert = 1;
+ break;
+ default:
+ rc = sqlite4BtControl(pDb->pBt, aParam[i].eParam, (void*)&iVal);
+ break;
+ }
+ }
+ testFree(aSpace);
+ }
+
+ return rc;
+}
+
+
+/*
+** Open a bt test database.
+**
+** zSpec is an option string as accepted by testBtConfigure() (may be
+** NULL). If bClear is true and zFilename is a non-empty string, the
+** database file and its "-wal" log file are unlinked first.
+**
+** The new handle routes all file I/O through a private wrapper VFS (the
+** btVfsXXX functions) so that crashes can be simulated. If the "mt" option
+** is non-zero, the handle is attached to a background checkpointer and
+** automatic checkpoints are disabled on it.
+**
+** On success SQLITE4_OK is returned and *ppDb is set to the new handle.
+** On failure an error code is returned, any partially constructed handle
+** is closed, and *ppDb is set to NULL.
+*/
+int test_bt_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+
+  static const DatabaseMethods SqlMethods = {
+    bt_close,
+    bt_write,
+    bt_delete,
+    bt_delete_range,
+    bt_fetch,
+    bt_scan,
+    bt_begin,
+    bt_commit,
+    bt_rollback
+  };
+  BtDb *p = 0;
+  bt_db *pBt = 0;
+  int rc;
+  sqlite4_env *pEnv = sqlite4_env_default();
+
+  if( bClear && zFilename && zFilename[0] ){
+    char *zLog = sqlite3_mprintf("%s-wal", zFilename);
+    unlink(zFilename);
+    if( zLog ) unlink(zLog);
+    sqlite3_free(zLog);
+  }
+
+  rc = sqlite4BtNew(pEnv, 0, &pBt);
+  if( rc==SQLITE4_OK ){
+    int mt = 0;                   /* True for multi-threaded connection */
+
+    p = (BtDb*)testMalloc(sizeof(BtDb));
+    p->base.pMethods = &SqlMethods;
+    p->pBt = pBt;
+    p->pEnv = pEnv;
+    p->nRef = 1;
+
+    p->env.pVfsCtx = (void*)p;
+    p->env.xFullpath = btVfsFullpath;
+    p->env.xOpen = btVfsOpen;
+    p->env.xSize = btVfsSize;
+    p->env.xRead = btVfsRead;
+    p->env.xWrite = btVfsWrite;
+    p->env.xTruncate = btVfsTruncate;
+    p->env.xSync = btVfsSync;
+    p->env.xSectorSize = btVfsSectorSize;
+    p->env.xClose = btVfsClose;
+    p->env.xUnlink = btVfsUnlink;
+    p->env.xLock = btVfsLock;
+    p->env.xTestLock = btVfsTestLock;
+    p->env.xShmMap = btVfsShmMap;
+    p->env.xShmBarrier = btVfsShmBarrier;
+    p->env.xShmUnmap = btVfsShmUnmap;
+
+    /* Remember the real VFS, then install the wrapper in its place. */
+    sqlite4BtControl(pBt, BT_CONTROL_GETVFS, (void*)&p->pVfs);
+    sqlite4BtControl(pBt, BT_CONTROL_SETVFS, (void*)&p->env);
+
+    rc = testBtConfigure(p, zSpec, &mt);
+    if( rc==SQLITE4_OK ){
+      rc = sqlite4BtOpen(pBt, zFilename);
+    }
+
+    if( rc==SQLITE4_OK && mt ){
+      int nAuto = 0;
+      rc = bgc_attach(p, zSpec);
+      sqlite4BtControl(pBt, BT_CONTROL_AUTOCKPT, (void*)&nAuto);
+    }
+  }
+
+  if( rc!=SQLITE4_OK && p ){
+    /* bt_close() frees p. Clear the pointer so that a dangling handle is
+    ** not returned to the caller. */
+    bt_close(&p->base);
+    p = 0;
+  }
+
+  /* "base" is the first member of BtDb, so this is &p->base, or NULL. */
+  *ppDb = (TestDb*)p;
+  return rc;
+}
+
+/*
+** Variant of test_bt_open() that always uses the option string "fast=1".
+** The zSpec argument is accepted but not consulted.
+*/
+int test_fbt_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  const char *zFixedSpec = "fast=1";
+  int rc = test_bt_open(zFixedSpec, zFilename, bClear, ppDb);
+  return rc;
+}
+
+/*
+** Variant of test_bt_open() that always uses the option string
+** "fast=1 blksz=32K pagesz=512". The zSpec argument is accepted but not
+** consulted.
+*/
+int test_fbts_open(
+  const char *zSpec,
+  const char *zFilename,
+  int bClear,
+  TestDb **ppDb
+){
+  const char *zFixedSpec = "fast=1 blksz=32K pagesz=512";
+  int rc = test_bt_open(zFixedSpec, zFilename, bClear, ppDb);
+  return rc;
+}
+
+
+/*
+** Arm the crash simulation on bt handle pTestDb: the crash occurs during
+** the iSync'th subsequent call to xSync(). The handle must be a bt handle
+** on which no crash has been simulated yet.
+*/
+void tdb_bt_prepare_sync_crash(TestDb *pTestDb, int iSync){
+  BtDb *pDb;
+  assert( pTestDb->pMethods->xClose==bt_close );
+  pDb = (BtDb*)pTestDb;
+  assert( pDb->bCrash==0 );
+  pDb->nCrashSync = iSync;
+}
+
+/*
+** If pDb is a bt handle (recognised by its xClose method), return the
+** underlying bt_db handle. Otherwise return NULL.
+*/
+bt_db *tdb_bt(TestDb *pDb){
+  bt_db *pRet = 0;
+  if( pDb->pMethods->xClose==bt_close ){
+    pRet = ((BtDb*)pDb)->pBt;
+  }
+  return pRet;
+}
+
+/*************************************************************************
+** Beginning of code for background checkpointer.
+*/
+
+/*
+** A background checkpointer. bgc_attach() creates one object per distinct
+** database file name, and connections to the same file share it (nRef
+** counts them). Each object owns one thread running bgc_main().
+**
+** bDoWork is set, and ckpter_cond signalled, with ckpter_mutex held. This
+** is done by bgc_logsize_cb() when the log has reached nLogsize, and by
+** bgc_detach() when the last client detaches.
+*/
+struct bt_ckpter {
+ sqlite4_buffer file; /* File name */
+ sqlite4_buffer spec; /* Options */
+ int nLogsize; /* Minimum log size to checkpoint */
+ int nRef; /* Number of clients */
+
+ int bDoWork; /* Set by client threads */
+ pthread_t ckpter_thread; /* Checkpointer thread */
+ pthread_cond_t ckpter_cond; /* Condition var the ckpter waits on */
+ pthread_mutex_t ckpter_mutex; /* Mutex used with ckpter_cond */
+
+ bt_ckpter *pNext; /* Next object in list at gBgc.pCkpter */
+};
+
+/* List of all background checkpointers. bgc_attach() and bgc_detach()
+** access it while holding the SQLITE4_MUTEX_STATIC_KV mutex. */
+static struct GlobalBackgroundCheckpointer {
+ bt_ckpter *pCkpter; /* Linked list of checkpointers */
+} gBgc;
+
+/*
+** Main routine of a background checkpointer thread. pArg is the bt_ckpter
+** object the thread serves.
+**
+** The thread opens its own connection to the database file and then
+** loops for as long as the checkpointer has clients (nRef>0). On each
+** pass it checkpoints the database if the log has reached
+** pCkpter->nLogsize, then sleeps until bDoWork is set. Always returns
+** NULL.
+*/
+static void *bgc_main(void *pArg){
+ BtDb *pDb = 0;
+ int rc;
+ int mt;
+ bt_ckpter *pCkpter = (bt_ckpter*)pArg;
+
+ /* Open with an empty option string so that the "mt" option is not
+ ** acted on by test_bt_open(); then apply the client's options. The
+ ** value written to mt and the result of testBtConfigure() are not
+ ** used. */
+ rc = test_bt_open("", (char*)pCkpter->file.p, 0, (TestDb**)&pDb);
+ assert( rc==SQLITE4_OK );
+ rc = testBtConfigure(pDb, (char*)pCkpter->spec.p, &mt);
+
+ while( pCkpter->nRef>0 ){
+ bt_db *db = pDb->pBt;
+ int nLog = 0;
+
+ /* NOTE(review): the empty read transaction presumably refreshes this
+ ** connection's view of the log before its size is queried - confirm. */
+ sqlite4BtBegin(db, 1);
+ sqlite4BtCommit(db, 0);
+ sqlite4BtControl(db, BT_CONTROL_LOGSIZE, (void*)&nLog);
+
+ if( nLog>=pCkpter->nLogsize ){
+ int rc;
+ bt_checkpoint ckpt;
+ memset(&ckpt, 0, sizeof(bt_checkpoint));
+ ckpt.nFrameBuffer = nLog/2;
+ rc = sqlite4BtControl(db, BT_CONTROL_CHECKPOINT, (void*)&ckpt);
+ assert( rc==SQLITE4_OK );
+ sqlite4BtControl(db, BT_CONTROL_LOGSIZE, (void*)&nLog);
+ }
+
+ /* The thread will wake up when it is signaled either because another
+ ** thread has created some work for this one or because the connection
+ ** is being closed. */
+ pthread_mutex_lock(&pCkpter->ckpter_mutex);
+ if( pCkpter->bDoWork==0 ){
+ pthread_cond_wait(&pCkpter->ckpter_cond, &pCkpter->ckpter_mutex);
+ }
+ pCkpter->bDoWork = 0;
+ pthread_mutex_unlock(&pCkpter->ckpter_mutex);
+ }
+
+ if( pDb ) bt_close((TestDb*)pDb);
+ return 0;
+}
+
+/*
+** Log-size callback registered by bgc_attach(). pCtx is the bt_ckpter
+** object. If the reported log size has reached the checkpointer's
+** threshold, flag that there is work to do and wake the checkpointer
+** thread.
+*/
+static void bgc_logsize_cb(void *pCtx, int nLogsize){
+  bt_ckpter *pCkpter = (bt_ckpter*)pCtx;
+  if( nLogsize<pCkpter->nLogsize ) return;
+
+  pthread_mutex_lock(&pCkpter->ckpter_mutex);
+  pCkpter->bDoWork = 1;
+  pthread_cond_signal(&pCkpter->ckpter_cond);
+  pthread_mutex_unlock(&pCkpter->ckpter_mutex);
+}
+
+/*
+** Attach connection pDb to the background checkpointer for its database
+** file, creating the checkpointer (and starting its thread) if none
+** exists yet for that file. zSpec is the option string that the
+** checkpointer thread applies to its own connection; it must not be NULL
+** here.
+**
+** Returns SQLITE4_OK on success, or the error from the
+** BT_CONTROL_INFO request.
+*/
+static int bgc_attach(BtDb *pDb, const char *zSpec){
+ int rc;
+ int n;
+ bt_info info;
+ bt_ckpter *pCkpter;
+
+ /* Figure out the full path to the database opened by handle pDb. */
+ info.eType = BT_INFO_FILENAME;
+ info.pgno = 0;
+ sqlite4_buffer_init(&info.output, 0);
+ rc = sqlite4BtControl(pDb->pBt, BT_CONTROL_INFO, (void*)&info);
+ if( rc!=SQLITE4_OK ) return rc;
+
+ sqlite4_mutex_enter(sqlite4_mutex_alloc(pDb->pEnv, SQLITE4_MUTEX_STATIC_KV));
+
+ /* Search for an existing bt_ckpter object. */
+ n = info.output.n;
+ for(pCkpter=gBgc.pCkpter; pCkpter; pCkpter=pCkpter->pNext){
+ if( n==pCkpter->file.n && 0==memcmp(info.output.p, pCkpter->file.p, n) ){
+ break;
+ }
+ }
+
+ /* Failed to find a suitable checkpointer. Create a new one. */
+ if( pCkpter==0 ){
+ bt_logsizecb cb;
+
+ pCkpter = testMalloc(sizeof(bt_ckpter));
+ /* The new object takes over the file name buffer. Clearing
+ ** info.output.p stops the sqlite4_buffer_clear() call at the end of
+ ** this function from releasing it. */
+ memcpy(&pCkpter->file, &info.output, sizeof(sqlite4_buffer));
+ info.output.p = 0;
+ pCkpter->pNext = gBgc.pCkpter;
+ pCkpter->nLogsize = 1000;
+ gBgc.pCkpter = pCkpter;
+ pCkpter->nRef = 1;
+
+ sqlite4_buffer_init(&pCkpter->spec, 0);
+ rc = sqlite4_buffer_set(&pCkpter->spec, zSpec, strlen(zSpec)+1);
+ assert( rc==SQLITE4_OK );
+
+ /* Kick off the checkpointer thread. */
+ if( rc==0 ) rc = pthread_cond_init(&pCkpter->ckpter_cond, 0);
+ if( rc==0 ) rc = pthread_mutex_init(&pCkpter->ckpter_mutex, 0);
+ if( rc==0 ){
+ rc = pthread_create(&pCkpter->ckpter_thread, 0, bgc_main, (void*)pCkpter);
+ }
+ assert( rc==0 ); /* todo: Fix this */
+
+ /* Set up the logsize callback for the client thread */
+ /* NOTE(review): the callback is only installed on the connection that
+ ** creates the checkpointer; connections that take the else branch
+ ** below never get one - confirm this is intended. */
+ cb.pCtx = (void*)pCkpter;
+ cb.xLogsize = bgc_logsize_cb;
+ sqlite4BtControl(pDb->pBt, BT_CONTROL_LOGSIZECB, (void*)&cb);
+ }else{
+ pCkpter->nRef++;
+ }
+
+ /* Assuming a checkpointer was encountered or effected, attach the
+ ** connection to it. */
+ if( pCkpter ){
+ pDb->pCkpter = pCkpter;
+ }
+
+ sqlite4_mutex_leave(sqlite4_mutex_alloc(pDb->pEnv, SQLITE4_MUTEX_STATIC_KV));
+ sqlite4_buffer_clear(&info.output);
+ return rc;
+}
+
+/*
+** Detach connection pDb from its background checkpointer, if it has one.
+**
+** The checkpointer's reference count is decremented. If this was the
+** last reference, the checkpointer is removed from the global list, its
+** thread is woken and joined, and the object is destroyed. Always returns
+** SQLITE4_OK.
+*/
+static int bgc_detach(BtDb *pDb){
+  int rc = SQLITE4_OK;
+  bt_ckpter *pCkpter = pDb->pCkpter;
+  if( pCkpter ){
+    int bShutdown = 0;            /* True if this is the last reference */
+
+    sqlite4_mutex_enter(sqlite4_mutex_alloc(pDb->pEnv,SQLITE4_MUTEX_STATIC_KV));
+    pCkpter->nRef--;
+    if( pCkpter->nRef==0 ){
+      bt_ckpter **pp;
+
+      /* Find the pointer that refers to pCkpter first, and only then
+      ** unlink the object through it. */
+      for(pp=&gBgc.pCkpter; *pp && *pp!=pCkpter; pp=&((*pp)->pNext));
+      if( *pp ) *pp = pCkpter->pNext;
+      bShutdown = 1;
+    }
+    sqlite4_mutex_leave(sqlite4_mutex_alloc(pDb->pEnv,SQLITE4_MUTEX_STATIC_KV));
+
+    if( bShutdown ){
+      void *pDummy;
+
+      /* Signal the checkpointer thread. It exits its loop once it sees
+      ** that nRef has dropped to zero. */
+      pthread_mutex_lock(&pCkpter->ckpter_mutex);
+      pCkpter->bDoWork = 1;
+      pthread_cond_signal(&pCkpter->ckpter_cond);
+      pthread_mutex_unlock(&pCkpter->ckpter_mutex);
+
+      /* Join the checkpointer thread. */
+      pthread_join(pCkpter->ckpter_thread, &pDummy);
+      pthread_cond_destroy(&pCkpter->ckpter_cond);
+      pthread_mutex_destroy(&pCkpter->ckpter_mutex);
+
+      sqlite4_buffer_clear(&pCkpter->file);
+      sqlite4_buffer_clear(&pCkpter->spec);
+      testFree(pCkpter);
+    }
+
+    pDb->pCkpter = 0;
+  }
+  return rc;
+}
+
+/*
+** End of background checkpointer.
+*************************************************************************/
+
+
diff --git a/ext/lsm1/lsm-test/lsmtest_util.c b/ext/lsm1/lsm-test/lsmtest_util.c
new file mode 100644
index 0000000..adab8a5
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_util.c
@@ -0,0 +1,223 @@
+
+#include "lsmtest.h"
+#include
+#include
+#include
+#ifndef _WIN32
+# include
+#endif
+
+/*
+** Global variables used within this module.
+*/
+static struct TestutilGlobal {
+ char **argv;
+ int argc;
+} g = {0, 0};
+
+static struct TestutilRnd {
+ unsigned int aRand1[2048]; /* Bits 0..10 */
+ unsigned int aRand2[2048]; /* Bits 11..21 */
+ unsigned int aRand3[1024]; /* Bits 22..31 */
+} r;
+
+/*************************************************************************
+** The following block is a copy of the implementation of SQLite function
+** sqlite3_randomness. This version has two important differences:
+**
+** 1. It always uses the same seed. So the sequence of random data output
+** is the same for every run of the program.
+**
+** 2. It is not threadsafe.
+*/
+static struct sqlite3PrngType {
+ unsigned char i, j; /* State variables */
+ unsigned char s[256]; /* State variables */
+} sqlite3Prng = {
+ 0xAF, 0x28,
+ {
+ 0x71, 0xF5, 0xB4, 0x6E, 0x80, 0xAB, 0x1D, 0xB8,
+ 0xFB, 0xB7, 0x49, 0xBF, 0xFF, 0x72, 0x2D, 0x14,
+ 0x79, 0x09, 0xE3, 0x78, 0x76, 0xB0, 0x2C, 0x0A,
+ 0x8E, 0x23, 0xEE, 0xDF, 0xE0, 0x9A, 0x2F, 0x67,
+ 0xE1, 0xBE, 0x0E, 0xA7, 0x08, 0x97, 0xEB, 0x77,
+ 0x78, 0xBA, 0x9D, 0xCA, 0x49, 0x4C, 0x60, 0x9A,
+ 0xF6, 0xBD, 0xDA, 0x7F, 0xBC, 0x48, 0x58, 0x52,
+ 0xE5, 0xCD, 0x83, 0x72, 0x23, 0x52, 0xFF, 0x6D,
+ 0xEF, 0x0F, 0x82, 0x29, 0xA0, 0x83, 0x3F, 0x7D,
+ 0xA4, 0x88, 0x31, 0xE7, 0x88, 0x92, 0x3B, 0x9B,
+ 0x3B, 0x2C, 0xC2, 0x4C, 0x71, 0xA2, 0xB0, 0xEA,
+ 0x36, 0xD0, 0x00, 0xF1, 0xD3, 0x39, 0x17, 0x5D,
+ 0x2A, 0x7A, 0xE4, 0xAD, 0xE1, 0x64, 0xCE, 0x0F,
+ 0x9C, 0xD9, 0xF5, 0xED, 0xB0, 0x22, 0x5E, 0x62,
+ 0x97, 0x02, 0xA3, 0x8C, 0x67, 0x80, 0xFC, 0x88,
+ 0x14, 0x0B, 0x15, 0x10, 0x0F, 0xC7, 0x40, 0xD4,
+ 0xF1, 0xF9, 0x0E, 0x1A, 0xCE, 0xB9, 0x1E, 0xA1,
+ 0x72, 0x8E, 0xD7, 0x78, 0x39, 0xCD, 0xF4, 0x5D,
+ 0x2A, 0x59, 0x26, 0x34, 0xF2, 0x73, 0x0B, 0xA0,
+ 0x02, 0x51, 0x2C, 0x03, 0xA3, 0xA7, 0x43, 0x13,
+ 0xE8, 0x98, 0x2B, 0xD2, 0x53, 0xF8, 0xEE, 0x91,
+ 0x7D, 0xE7, 0xE3, 0xDA, 0xD5, 0xBB, 0xC0, 0x92,
+ 0x9D, 0x98, 0x01, 0x2C, 0xF9, 0xB9, 0xA0, 0xEB,
+ 0xCF, 0x32, 0xFA, 0x01, 0x49, 0xA5, 0x1D, 0x9A,
+ 0x76, 0x86, 0x3F, 0x40, 0xD4, 0x89, 0x8F, 0x9C,
+ 0xE2, 0xE3, 0x11, 0x31, 0x37, 0xB2, 0x49, 0x28,
+ 0x35, 0xC0, 0x99, 0xB6, 0xD0, 0xBC, 0x66, 0x35,
+ 0xF7, 0x83, 0x5B, 0xD7, 0x37, 0x1A, 0x2B, 0x18,
+ 0xA6, 0xFF, 0x8D, 0x7C, 0x81, 0xA8, 0xFC, 0x9E,
+ 0xC4, 0xEC, 0x80, 0xD0, 0x98, 0xA7, 0x76, 0xCC,
+ 0x9C, 0x2F, 0x7B, 0xFF, 0x8E, 0x0E, 0xBB, 0x90,
+ 0xAE, 0x13, 0x06, 0xF5, 0x1C, 0x4E, 0x52, 0xF7
+ }
+};
+
+/* Generate and return single random byte.
+**
+** Each call advances the state held in the global sqlite3Prng: index i is
+** incremented, index j is advanced by s[i], and s[i] and s[j] are
+** swapped. All index arithmetic is done in unsigned char and so wraps
+** modulo 256. */
+static unsigned char randomByte(void){
+ unsigned char t;
+ sqlite3Prng.i++;
+ t = sqlite3Prng.s[sqlite3Prng.i];
+ sqlite3Prng.j += t;
+ sqlite3Prng.s[sqlite3Prng.i] = sqlite3Prng.s[sqlite3Prng.j];
+ sqlite3Prng.s[sqlite3Prng.j] = t;
+ t += sqlite3Prng.s[sqlite3Prng.i];
+ return sqlite3Prng.s[t];
+}
+
+/*
+** Return N random bytes.
+*/
+static void randomBlob(int nBuf, unsigned char *zBuf){
+ int i;
+ for(i=0; i>11) & 0x000007FF] ^
+ r.aRand3[(iVal>>22) & 0x000003FF]
+ ;
+}
+
+void testPrngArray(unsigned int iVal, unsigned int *aOut, int nOut){
+ int i;
+ for(i=0; izName;
+ pEntry=(struct Entry *)&((unsigned char *)pEntry)[sz]
+ ){
+ if( zPrev ){ testPrintError("%s, ", zPrev); }
+ zPrev = pEntry->zName;
+ }
+ testPrintError("or %s\n", zPrev);
+}
+
+/*
+** Look up string zArg in a table of named entries.
+**
+** aData points to an array of structures, each sz bytes in size and each
+** starting with a (const char*) name. The array is terminated by an entry
+** with a NULL name. zArg selects an entry if it is an exact match for the
+** name, or if it is a prefix of exactly one name in the table. An exact
+** match always wins and ends the search.
+**
+** On success the index of the selected entry is written to *piOut and 0
+** is returned. Otherwise (no match, or an ambiguous prefix) an error
+** message built from zType is printed via argError(), *piOut is left
+** unchanged and 1 is returned.
+*/
+int testArgSelectX(
+  void *aData,
+  const char *zType,
+  int sz,
+  const char *zArg,
+  int *piOut
+){
+  struct Entry { const char *zName; };
+  unsigned char *aByte = (unsigned char *)aData;
+  int nArg = strlen(zArg);
+  int nMatch = 0;                 /* Number of candidate entries found */
+  int iMatch = -1;                /* Index of most recent candidate */
+  int iEntry;
+
+  for(iEntry=0; ; iEntry++){
+    struct Entry *pEntry = (struct Entry *)&aByte[iEntry*sz];
+    int nName;
+    if( pEntry->zName==0 ) break;
+
+    nName = strlen(pEntry->zName);
+    if( nName<nArg || memcmp(pEntry->zName, zArg, nArg)!=0 ) continue;
+
+    iMatch = iEntry;
+    if( nName==nArg ){
+      nMatch = 1;
+      break;
+    }
+    nMatch++;
+  }
+
+  if( nMatch==1 ){
+    *piOut = iMatch;
+    return 0;
+  }
+  argError(aData, zType, sz, zArg);
+  return 1;
+}
+
+/* Reference time recorded by testTimeInit(). testTimeGet() reports the
+** time elapsed since this instant. */
+struct timeval zero_time;
+
+/* Record the current time of day as the reference for testTimeGet(). */
+void testTimeInit(void){
+ gettimeofday(&zero_time, 0);
+}
+
+/*
+** Return the number of milliseconds that have passed since zero_time was
+** last set by testTimeInit().
+*/
+int testTimeGet(void){
+  struct timeval now;
+  int nSec;                       /* Difference in whole seconds */
+  int nUsec;                      /* Difference in microsecond fields */
+
+  gettimeofday(&now, 0);
+  nSec = (int)now.tv_sec - (int)zero_time.tv_sec;
+  nUsec = (int)now.tv_usec - (int)zero_time.tv_usec;
+  return (nSec*1000) + (nUsec/1000);
+}
diff --git a/ext/lsm1/lsm-test/lsmtest_win32.c b/ext/lsm1/lsm-test/lsmtest_win32.c
new file mode 100644
index 0000000..9472723
--- /dev/null
+++ b/ext/lsm1/lsm-test/lsmtest_win32.c
@@ -0,0 +1,30 @@
+
+#include "lsmtest.h"
+
+#ifdef _WIN32
+
+#define TICKS_PER_SECOND (10000000)
+#define TICKS_PER_MICROSECOND (10)
+#define TICKS_UNIX_EPOCH (116444736000000000LL)
+
+/*
+** Windows stand-in for gettimeofday(). Reads the system time as a
+** FILETIME tick count, subtracts TICKS_UNIX_EPOCH and stores the result
+** in *tp as whole seconds plus microseconds. The tzp argument is ignored.
+** Always returns 0.
+*/
+int win32GetTimeOfDay(
+  struct timeval *tp,
+  void *tzp
+){
+  FILETIME ft;
+  ULONGLONG nTick;                /* Ticks as reported by the system */
+  ULONGLONG nUnix;                /* Ticks since TICKS_UNIX_EPOCH */
+
+  unused_parameter(tzp);
+  memset(&ft, 0, sizeof(FILETIME));
+  GetSystemTimeAsFileTime(&ft);
+
+  nTick = ((ULONGLONG)ft.dwHighDateTime << 32) | (ULONGLONG)ft.dwLowDateTime;
+  nUnix = nTick - TICKS_UNIX_EPOCH;
+
+  tp->tv_sec = (long)(nUnix / TICKS_PER_SECOND);
+  nUnix -= ((ULONGLONG)tp->tv_sec * TICKS_PER_SECOND);
+  tp->tv_usec = (long)(nUnix / TICKS_PER_MICROSECOND);
+
+  return 0;
+}
+#endif
diff --git a/ext/lsm1/lsm.h b/ext/lsm1/lsm.h
new file mode 100644
index 0000000..48701c4
--- /dev/null
+++ b/ext/lsm1/lsm.h
@@ -0,0 +1,684 @@
+/*
+** 2011-08-10
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** This file defines the LSM API.
+*/
+#ifndef _LSM_H
+#define _LSM_H
+#include <stddef.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+** Opaque handle types.
+*/
+typedef struct lsm_compress lsm_compress; /* Compression library functions */
+typedef struct lsm_compress_factory lsm_compress_factory;
+typedef struct lsm_cursor lsm_cursor; /* Database cursor handle */
+typedef struct lsm_db lsm_db; /* Database connection handle */
+typedef struct lsm_env lsm_env; /* Runtime environment */
+typedef struct lsm_file lsm_file; /* OS file handle */
+typedef struct lsm_mutex lsm_mutex; /* Mutex handle */
+
+/* 64-bit integer type used for file offsets. */
+typedef long long int lsm_i64; /* 64-bit signed integer type */
+
+/* Candidate values for the 3rd argument to lsm_env.xLock() */
+#define LSM_LOCK_UNLOCK 0
+#define LSM_LOCK_SHARED 1
+#define LSM_LOCK_EXCL 2
+
+/* Flags for lsm_env.xOpen() */
+#define LSM_OPEN_READONLY 0x0001
+
+/*
+** CAPI: Database Runtime Environment
+**
+** Run-time environment used by LSM
+*/
+struct lsm_env { /* Method table: file i/o, memory allocation, mutexes, sleep */
+ int nByte; /* Size of this structure in bytes */
+ int iVersion; /* Version number of this structure (1) */
+ /****** file i/o ***********************************************/
+ void *pVfsCtx;
+ int (*xFullpath)(lsm_env*, const char *, char *, int *);
+ int (*xOpen)(lsm_env*, const char *, int flags, lsm_file **); /* flags: see LSM_OPEN_READONLY */
+ int (*xRead)(lsm_file *, lsm_i64, void *, int);
+ int (*xWrite)(lsm_file *, lsm_i64, void *, int);
+ int (*xTruncate)(lsm_file *, lsm_i64);
+ int (*xSync)(lsm_file *);
+ int (*xSectorSize)(lsm_file *);
+ int (*xRemap)(lsm_file *, lsm_i64, void **, lsm_i64*);
+ int (*xFileid)(lsm_file *, void *pBuf, int *pnBuf);
+ int (*xClose)(lsm_file *);
+ int (*xUnlink)(lsm_env*, const char *);
+ int (*xLock)(lsm_file*, int, int); /* 3rd arg: an LSM_LOCK_* value */
+ int (*xTestLock)(lsm_file*, int, int, int);
+ int (*xShmMap)(lsm_file*, int, int, void **);
+ void (*xShmBarrier)(void);
+ int (*xShmUnmap)(lsm_file*, int);
+ /****** memory allocation ****************************************/
+ void *pMemCtx;
+ void *(*xMalloc)(lsm_env*, size_t); /* malloc(3) function */
+ void *(*xRealloc)(lsm_env*, void *, size_t); /* realloc(3) function */
+ void (*xFree)(lsm_env*, void *); /* free(3) function */
+ size_t (*xSize)(lsm_env*, void *); /* xSize function */
+ /****** mutexes ****************************************************/
+ void *pMutexCtx;
+ int (*xMutexStatic)(lsm_env*,int,lsm_mutex**); /* Obtain a static mutex (2nd arg: LSM_MUTEX_*) */
+ int (*xMutexNew)(lsm_env*, lsm_mutex**); /* Get a new dynamic mutex */
+ void (*xMutexDel)(lsm_mutex *); /* Delete an allocated mutex */
+ void (*xMutexEnter)(lsm_mutex *); /* Grab a mutex */
+ int (*xMutexTry)(lsm_mutex *); /* Attempt to obtain a mutex */
+ void (*xMutexLeave)(lsm_mutex *); /* Leave a mutex */
+ int (*xMutexHeld)(lsm_mutex *); /* Return true if mutex is held */
+ int (*xMutexNotHeld)(lsm_mutex *); /* Return true if mutex not held */
+ /****** other ****************************************************/
+ int (*xSleep)(lsm_env*, int microseconds); /* Duration argument is in microseconds (per its name) */
+
+ /* New fields may be added in future releases, in which case the
+ ** iVersion value will increase. */
+};
+
+/*
+** Values that may be passed as the second argument to xMutexStatic.
+*/
+#define LSM_MUTEX_GLOBAL 1
+#define LSM_MUTEX_HEAP 2
+
+/*
+** CAPI: LSM Error Codes
+*/
+#define LSM_OK 0
+#define LSM_ERROR 1
+#define LSM_BUSY 5
+#define LSM_NOMEM 7
+#define LSM_READONLY 8
+#define LSM_IOERR 10
+#define LSM_CORRUPT 11
+#define LSM_FULL 13
+#define LSM_CANTOPEN 14
+#define LSM_PROTOCOL 15
+#define LSM_MISUSE 21
+
+#define LSM_MISMATCH 50
+
+
+#define LSM_IOERR_NOENT (LSM_IOERR | (1<<8))
+
+/*
+** CAPI: Creating and Destroying Database Connection Handles
+**
+** Open and close a database connection handle.
+*/
+int lsm_new(lsm_env*, lsm_db **ppDb);
+int lsm_close(lsm_db *pDb);
+
+/*
+** CAPI: Connecting to a Database
+*/
+int lsm_open(lsm_db *pDb, const char *zFilename);
+
+/*
+** CAPI: Obtaining pointers to database environments
+**
+** Return a pointer to the environment used by the database connection
+** passed as the first argument. Assuming the argument is valid, this
+** function always returns a valid environment pointer - it cannot fail.
+*/
+lsm_env *lsm_get_env(lsm_db *pDb);
+
+/*
+** The lsm_default_env() function returns a pointer to the default LSM
+** environment for the current platform.
+*/
+lsm_env *lsm_default_env(void);
+
+
+/*
+** CAPI: Configuring a database connection.
+**
+** The lsm_config() function is used to configure a database connection.
+*/
+int lsm_config(lsm_db *, int, ...);
+
+/*
+** The following values may be passed as the second argument to lsm_config().
+**
+** LSM_CONFIG_AUTOFLUSH:
+** A read/write integer parameter.
+**
+** This value determines the amount of data allowed to accumulate in a
+** live in-memory tree before it is marked as old. After committing a
+** transaction, a connection checks if the size of the live in-memory tree,
+** including data structure overhead, is greater than the value of this
+** option in KB. If it is, and there is not already an old in-memory tree,
+** the live in-memory tree is marked as old.
+**
+** The maximum allowable value is 1048576 (1GB). There is no minimum
+** value. If this parameter is set to zero, then an attempt is made to
+** mark the live in-memory tree as old after each transaction is committed.
+**
+** The default value is 1024 (1MB).
+**
+** LSM_CONFIG_PAGE_SIZE:
+** A read/write integer parameter. This parameter may only be set before
+** lsm_open() has been called.
+**
+** LSM_CONFIG_BLOCK_SIZE:
+** A read/write integer parameter.
+**
+** This parameter may only be set before lsm_open() has been called. It
+** must be set to a power of two between 64 and 65536, inclusive (block
+** sizes between 64KB and 64MB).
+**
+** If the connection creates a new database, the block size of the new
+** database is set to the value of this option in KB. After lsm_open()
+** has been called, querying this parameter returns the actual block
+** size of the opened database.
+**
+** The default value is 1024 (1MB blocks).
+**
+** LSM_CONFIG_SAFETY:
+** A read/write integer parameter. Valid values are 0, 1 (the default)
+** and 2. This parameter determines how robust the database is in the
+** face of a system crash (e.g. a power failure or operating system
+** crash). As follows:
+**
+** 0 (off): No robustness. A system crash may corrupt the database.
+**
+** 1 (normal): Some robustness. A system crash may not corrupt the
+** database file, but recently committed transactions may
+** be lost following recovery.
+**
+** 2 (full): Full robustness. A system crash may not corrupt the
+** database file. Following recovery the database file
+** contains all successfully committed transactions.
+**
+** LSM_CONFIG_AUTOWORK:
+** A read/write integer parameter.
+**
+** LSM_CONFIG_AUTOCHECKPOINT:
+** A read/write integer parameter.
+**
+** If this option is set to non-zero value N, then a checkpoint is
+** automatically attempted after each N KB of data have been written to
+** the database file.
+**
+** The amount of uncheckpointed data already written to the database file
+** is a global parameter. After performing database work (writing to the
+** database file), the process checks if the total amount of uncheckpointed
+** data exceeds the value of this parameter. If so, a checkpoint is performed.
+** This means that this option may cause the connection to perform a
+** checkpoint even if the current connection has itself written very little
+** data into the database file.
+**
+** The default value is 2048 (checkpoint every 2MB).
+**
+** LSM_CONFIG_MMAP:
+** A read/write integer parameter. If this value is set to 0, then the
+** database file is accessed using ordinary read/write IO functions. Or,
+** if it is set to 1, then the database file is memory mapped and accessed
+** that way. If this parameter is set to any value N greater than 1, then
+** up to the first N KB of the file are memory mapped, and any remainder
+** accessed using read/write IO.
+**
+** The default value is 1 on 64-bit platforms and 32768 on 32-bit platforms.
+**
+**
+** LSM_CONFIG_USE_LOG:
+** A read/write boolean parameter. True (the default) to use the log
+** file normally. False otherwise.
+**
+** LSM_CONFIG_AUTOMERGE:
+** A read/write integer parameter. The minimum number of segments to
+** merge together at a time. Default value 4.
+**
+** LSM_CONFIG_MAX_FREELIST:
+** A read/write integer parameter. The maximum number of free-list
+** entries that are stored in a database checkpoint (the others are
+** stored elsewhere in the database).
+**
+** There is no reason for an application to configure or query this
+** parameter. It is only present because configuring a small value
+** makes certain parts of the lsm code easier to test.
+**
+** LSM_CONFIG_MULTIPLE_PROCESSES:
+** A read/write boolean parameter. This parameter may only be set before
+** lsm_open() has been called. If true, the library uses shared-memory
+** and posix advisory locks to co-ordinate access by clients from within
+** multiple processes. Otherwise, if false, all database clients must be
+** located in the same process. The default value is true.
+**
+** LSM_CONFIG_SET_COMPRESSION:
+** Set the compression methods used to compress and decompress database
+** content. The argument to this option should be a pointer to a structure
+** of type lsm_compress. The lsm_config() method takes a copy of the
+** structure's contents.
+**
+** This option may only be used before lsm_open() is called. Invoking it
+** after lsm_open() has been called results in an LSM_MISUSE error.
+**
+** LSM_CONFIG_GET_COMPRESSION:
+** Query the compression methods used to compress and decompress database
+** content.
+**
+** LSM_CONFIG_SET_COMPRESSION_FACTORY:
+** Configure a factory method to be invoked in case of an LSM_MISMATCH
+** error.
+**
+** LSM_CONFIG_READONLY:
+** A read/write boolean parameter. This parameter may only be set before
+** lsm_open() is called.
+*/
+#define LSM_CONFIG_AUTOFLUSH 1
+#define LSM_CONFIG_PAGE_SIZE 2
+#define LSM_CONFIG_SAFETY 3
+#define LSM_CONFIG_BLOCK_SIZE 4
+#define LSM_CONFIG_AUTOWORK 5
+#define LSM_CONFIG_MMAP 7
+#define LSM_CONFIG_USE_LOG 8
+#define LSM_CONFIG_AUTOMERGE 9
+#define LSM_CONFIG_MAX_FREELIST 10
+#define LSM_CONFIG_MULTIPLE_PROCESSES 11
+#define LSM_CONFIG_AUTOCHECKPOINT 12
+#define LSM_CONFIG_SET_COMPRESSION 13
+#define LSM_CONFIG_GET_COMPRESSION 14
+#define LSM_CONFIG_SET_COMPRESSION_FACTORY 15
+#define LSM_CONFIG_READONLY 16
+
+#define LSM_SAFETY_OFF 0
+#define LSM_SAFETY_NORMAL 1
+#define LSM_SAFETY_FULL 2
+
+/*
+** CAPI: Compression and/or Encryption Hooks
+*/
+struct lsm_compress { /* Hooks set via LSM_CONFIG_SET_COMPRESSION (lsm_config() copies them) */
+ void *pCtx; /* NOTE(review): presumably the (void *) 1st argument of the hooks - confirm */
+ unsigned int iId; /* NOTE(review): presumably the id seen via LSM_INFO_COMPRESSION_ID - confirm */
+ int (*xBound)(void *, int nSrc);
+ int (*xCompress)(void *, char *, int *, const char *, int);
+ int (*xUncompress)(void *, char *, int *, const char *, int);
+ void (*xFree)(void *pCtx);
+};
+
+struct lsm_compress_factory { /* Set via LSM_CONFIG_SET_COMPRESSION_FACTORY */
+ void *pCtx;
+ int (*xFactory)(void *, lsm_db *, unsigned int); /* Invoked in case of an LSM_MISMATCH error */
+ void (*xFree)(void *pCtx);
+};
+
+#define LSM_COMPRESSION_EMPTY 0
+#define LSM_COMPRESSION_NONE 1
+
+/*
+** CAPI: Allocating and Freeing Memory
+**
+** Invoke the memory allocation functions that belong to environment
+** pEnv. Or the system defaults if no memory allocation functions have
+** been registered.
+*/
+void *lsm_malloc(lsm_env*, size_t);
+void *lsm_realloc(lsm_env*, void *, size_t);
+void lsm_free(lsm_env*, void *);
+
+/*
+** CAPI: Querying a Connection For Operational Data
+**
+** Query a database connection for operational statistics or data.
+*/
+int lsm_info(lsm_db *, int, ...);
+
+int lsm_get_user_version(lsm_db *, unsigned int *);
+int lsm_set_user_version(lsm_db *, unsigned int);
+
+/*
+** The following values may be passed as the second argument to lsm_info().
+**
+** LSM_INFO_NWRITE:
+** The third parameter should be of type (int *). The location pointed
+** to by the third parameter is set to the number of 4KB pages written to
+** the database file during the lifetime of this connection.
+**
+** LSM_INFO_NREAD:
+** The third parameter should be of type (int *). The location pointed
+** to by the third parameter is set to the number of 4KB pages read from
+** the database file during the lifetime of this connection.
+**
+** LSM_INFO_DB_STRUCTURE:
+** The third argument should be of type (char **). The location pointed
+** to is populated with a pointer to a nul-terminated string containing
+** the string representation of a Tcl data-structure reflecting the
+** current structure of the database file. Specifically, the current state
+** of the worker snapshot. The returned string should be eventually freed
+** by the caller using lsm_free().
+**
+** The returned list contains one element for each level in the database,
+** in order from most to least recent. Each element contains a
+** single element for each segment comprising the corresponding level,
+** starting with the lhs segment, then each of the rhs segments (if any)
+** in order from most to least recent.
+**
+** Each segment element is itself a list of 4 integer values, as follows:
+**
+**   1. First page of segment
+**   2. Last page of segment
+**   3. Root page of segment (if applicable)
+**   4. Total number of pages in segment
+**
+**
+** LSM_INFO_ARRAY_STRUCTURE:
+** There should be two arguments passed following this option (i.e. a
+** total of four arguments passed to lsm_info()). The first argument
+** should be the page number of the first page in a database array
+** (perhaps obtained from an earlier INFO_DB_STRUCTURE call). The second
+** trailing argument should be of type (char **). The location pointed
+** to is populated with a pointer to a nul-terminated string that must
+** be eventually freed using lsm_free() by the caller.
+**
+** The output string contains the text representation of a Tcl list of
+** integers. Each pair of integers represent a range of pages used by
+** the identified array. For example, if the array occupies database
+** pages 993 to 1024, then pages 2048 to 2777, then the returned string
+** will be "993 1024 2048 2777".
+**
+** If the specified integer argument does not correspond to the first
+** page of any database array, LSM_ERROR is returned and the output
+** pointer is set to a NULL value.
+**
+** LSM_INFO_LOG_STRUCTURE:
+** The third argument should be of type (char **). The location pointed
+** to is populated with a pointer to a nul-terminated string containing
+** the string representation of a Tcl data-structure. The returned
+** string should be eventually freed by the caller using lsm_free().
+**
+** The Tcl structure returned is a list of six integers that describe
+** the current structure of the log file.
+**
+** LSM_INFO_ARRAY_PAGES:
+**
+** LSM_INFO_PAGE_ASCII_DUMP:
+** As with LSM_INFO_ARRAY_STRUCTURE, there should be two arguments passed
+** with calls that specify this option - an integer page number and a
+** (char **) used to return a nul-terminated string that must be later
+** freed using lsm_free(). In this case the output string is populated
+** with a human-readable description of the page content.
+**
+** If the page cannot be decoded, it is not an error. In this case the
+** human-readable output message will report the system's failure to
+** interpret the page data.
+**
+** LSM_INFO_PAGE_HEX_DUMP:
+** This argument is similar to PAGE_ASCII_DUMP, except that keys and
+** values are represented using hexadecimal notation instead of ascii.
+**
+** LSM_INFO_FREELIST:
+** The third argument should be of type (char **). The location pointed
+** to is populated with a pointer to a nul-terminated string containing
+** the string representation of a Tcl data-structure. The returned
+** string should be eventually freed by the caller using lsm_free().
+**
+** The Tcl structure returned is a list containing one element for each
+** free block in the database. The element itself consists of two
+** integers - the block number and the id of the snapshot that freed it.
+**
+** LSM_INFO_CHECKPOINT_SIZE:
+** The third argument should be of type (int *). The location pointed to
+** by this argument is populated with the number of KB written to the
+** database file since the most recent checkpoint.
+**
+** LSM_INFO_TREE_SIZE:
+** If this value is passed as the second argument to an lsm_info() call, it
+** should be followed by two arguments of type (int *) (for a total of four
+** arguments).
+**
+** At any time, there are either one or two tree structures held in shared
+** memory that new database clients will access (there may also be additional
+** tree structures being used by older clients - this API does not provide
+** information on them). One tree structure - the current tree - is used to
+** accumulate new data written to the database. The other tree structure -
+** the old tree - is a read-only tree holding older data and may be flushed
+** to disk at any time.
+**
+** Assuming no error occurs, the location pointed to by the first of the two
+** (int *) arguments is set to the size of the old in-memory tree in KB.
+** The second is set to the size of the current, or live in-memory tree.
+**
+** LSM_INFO_COMPRESSION_ID:
+** This value should be followed by a single argument of type
+** (unsigned int *). If successful, the location pointed to is populated
+** with the database compression id before returning.
+*/
+#define LSM_INFO_NWRITE 1
+#define LSM_INFO_NREAD 2
+#define LSM_INFO_DB_STRUCTURE 3
+#define LSM_INFO_LOG_STRUCTURE 4
+#define LSM_INFO_ARRAY_STRUCTURE 5
+#define LSM_INFO_PAGE_ASCII_DUMP 6
+#define LSM_INFO_PAGE_HEX_DUMP 7
+#define LSM_INFO_FREELIST 8
+#define LSM_INFO_ARRAY_PAGES 9
+#define LSM_INFO_CHECKPOINT_SIZE 10
+#define LSM_INFO_TREE_SIZE 11
+#define LSM_INFO_FREELIST_SIZE 12
+#define LSM_INFO_COMPRESSION_ID 13
+
+
+/*
+** CAPI: Opening and Closing Write Transactions
+**
+** These functions are used to open and close transactions and nested
+** sub-transactions.
+**
+** The lsm_begin() function is used to open transactions and sub-transactions.
+** A successful call to lsm_begin() ensures that there are at least iLevel
+** nested transactions open. To open a top-level transaction, pass iLevel=1.
+** To open a sub-transaction within the top-level transaction, iLevel=2.
+** Passing iLevel=0 is a no-op.
+**
+** lsm_commit() is used to commit transactions and sub-transactions. A
+** successful call to lsm_commit() ensures that there are at most iLevel
+** nested transactions open. To commit a top-level transaction, pass iLevel=0.
+** To commit all sub-transactions inside the main transaction, pass iLevel=1.
+**
+** Function lsm_rollback() is used to roll back transactions and
+** sub-transactions. A successful call to lsm_rollback() restores the database
+** to the state it was in when the iLevel'th nested sub-transaction (if any)
+** was first opened. And then closes transactions to ensure that there are
+** at most iLevel nested transactions open. Passing iLevel=0 rolls back and
+** closes the top-level transaction. iLevel=1 also rolls back the top-level
+** transaction, but leaves it open. iLevel=2 rolls back the sub-transaction
+** nested directly inside the top-level transaction (and leaves it open).
+*/
+int lsm_begin(lsm_db *pDb, int iLevel);
+int lsm_commit(lsm_db *pDb, int iLevel);
+int lsm_rollback(lsm_db *pDb, int iLevel);
+
+/*
+** CAPI: Writing to a Database
+**
+** Write a new value into the database. If a value with a duplicate key
+** already exists it is replaced.
+*/
+int lsm_insert(lsm_db*, const void *pKey, int nKey, const void *pVal, int nVal);
+
+/*
+** Delete a value from the database. No error is returned if the specified
+** key value does not exist in the database.
+*/
+int lsm_delete(lsm_db *, const void *pKey, int nKey);
+
+/*
+** Delete all database entries with keys that are greater than (pKey1/nKey1)
+** and smaller than (pKey2/nKey2). Note that keys (pKey1/nKey1) and
+** (pKey2/nKey2) themselves, if they exist in the database, are not deleted.
+**
+** Return LSM_OK if successful, or an LSM error code otherwise.
+*/
+int lsm_delete_range(lsm_db *,
+ const void *pKey1, int nKey1, const void *pKey2, int nKey2
+);
+
+/*
+** CAPI: Explicit Database Work and Checkpointing
+**
+** This function is called by a thread to work on the database structure.
+*/
+int lsm_work(lsm_db *pDb, int nMerge, int nKB, int *pnWrite);
+
+int lsm_flush(lsm_db *pDb);
+
+/*
+** Attempt to checkpoint the current database snapshot. Return an LSM
+** error code if an error occurs or LSM_OK otherwise.
+**
+** If the current snapshot has already been checkpointed, calling this
+** function is a no-op. In this case if pnKB is not NULL, *pnKB is
+** set to 0. Or, if the current snapshot is successfully checkpointed
+** by this function and pnKB is not NULL, *pnKB is set to the number
+** of KB written to the database file since the previous checkpoint
+** (the same measure as returned by the LSM_INFO_CHECKPOINT_SIZE query).
+*/
+int lsm_checkpoint(lsm_db *pDb, int *pnKB);
+
+/*
+** CAPI: Opening and Closing Database Cursors
+**
+** Open and close a database cursor.
+*/
+int lsm_csr_open(lsm_db *pDb, lsm_cursor **ppCsr);
+int lsm_csr_close(lsm_cursor *pCsr);
+
+/*
+** CAPI: Positioning Database Cursors
+**
+** If the fourth parameter is LSM_SEEK_EQ, LSM_SEEK_GE or LSM_SEEK_LE,
+** this function searches the database for an entry with key (pKey/nKey).
+** If an error occurs, an LSM error code is returned. Otherwise, LSM_OK.
+**
+** If no error occurs and the requested key is present in the database, the
+** cursor is left pointing to the entry with the specified key. Or, if the
+** specified key is not present in the database the state of the cursor
+** depends on the value passed as the final parameter, as follows:
+**
+** LSM_SEEK_EQ:
+** The cursor is left at EOF (invalidated). A call to lsm_csr_valid()
+** returns zero.
+**
+** LSM_SEEK_LE:
+** The cursor is left pointing to the largest key in the database that
+** is smaller than (pKey/nKey). If the database contains no keys smaller
+** than (pKey/nKey), the cursor is left at EOF.
+**
+** LSM_SEEK_GE:
+** The cursor is left pointing to the smallest key in the database that
+** is larger than (pKey/nKey). If the database contains no keys larger
+** than (pKey/nKey), the cursor is left at EOF.
+**
+** If the fourth parameter is LSM_SEEK_LEFAST, this function searches the
+** database in a similar manner to LSM_SEEK_LE, with two differences:
+**
+**   1. Even if a key can be found (the cursor is not left at EOF), the
+**      lsm_csr_value() function may not be used (attempts to do so return
+**      LSM_MISUSE).
+**
+**   2. The key that the cursor is left pointing to may be one that has
+**      been recently deleted from the database. In this case it is
+**      guaranteed that the returned key is larger than any key currently
+**      in the database that is less than or equal to (pKey/nKey).
+**
+**
+** LSM_SEEK_LEFAST requests are intended to be used to allocate database
+** keys.
+*/
+int lsm_csr_seek(lsm_cursor *pCsr, const void *pKey, int nKey, int eSeek);
+
+int lsm_csr_first(lsm_cursor *pCsr);
+int lsm_csr_last(lsm_cursor *pCsr);
+
+/*
+** Advance the specified cursor to the next or previous key in the database.
+** Return LSM_OK if successful, or an LSM error code otherwise.
+**
+** Functions lsm_csr_seek(), lsm_csr_first() and lsm_csr_last() are "seek"
+** functions. Whether or not lsm_csr_next and lsm_csr_prev may be called
+** successfully also depends on the most recent seek function called on
+** the cursor. Specifically:
+**
+**
+**   - At least one seek function must have been called on the cursor.
+**   - To call lsm_csr_next(), the most recent call to a seek function must
+**     have been either lsm_csr_first() or a call to lsm_csr_seek() specifying
+**     LSM_SEEK_GE.
+**   - To call lsm_csr_prev(), the most recent call to a seek function must
+**     have been either lsm_csr_last() or a call to lsm_csr_seek() specifying
+**     LSM_SEEK_LE.
+**
+**
+** Otherwise, if the above conditions are not met when lsm_csr_next or
+** lsm_csr_prev is called, LSM_MISUSE is returned and the cursor position
+** remains unchanged.
+*/
+int lsm_csr_next(lsm_cursor *pCsr);
+int lsm_csr_prev(lsm_cursor *pCsr);
+
+/*
+** Values that may be passed as the fourth argument to lsm_csr_seek().
+*/
+#define LSM_SEEK_LEFAST -2
+#define LSM_SEEK_LE -1
+#define LSM_SEEK_EQ 0
+#define LSM_SEEK_GE 1
+
+/*
+** CAPI: Extracting Data From Database Cursors
+**
+** Retrieve data from a database cursor.
+*/
+int lsm_csr_valid(lsm_cursor *pCsr);
+int lsm_csr_key(lsm_cursor *pCsr, const void **ppKey, int *pnKey);
+int lsm_csr_value(lsm_cursor *pCsr, const void **ppVal, int *pnVal);
+
+/*
+** If no error occurs, this function compares the database key passed via
+** the pKey/nKey arguments with the key that the cursor passed as the first
+** argument currently points to. If the cursor's key is less than, equal to
+** or greater than pKey/nKey, *piRes is set to less than, equal to or greater
+** than zero before returning. LSM_OK is returned in this case.
+**
+** Or, if an error occurs, an LSM error code is returned and the final
+** value of *piRes is undefined. If the cursor does not point to a valid
+** key when this function is called, LSM_MISUSE is returned.
+*/
+int lsm_csr_cmp(lsm_cursor *pCsr, const void *pKey, int nKey, int *piRes);
+
+/*
+** CAPI: Change these!!
+**
+** Configure a callback to which debugging and other messages should
+** be directed. Only useful for debugging lsm.
+*/
+void lsm_config_log(lsm_db *, void (*)(void *, int, const char *), void *);
+
+/*
+** Configure a callback that is invoked if the database connection ever
+** writes to the database file.
+*/
+void lsm_config_work_hook(lsm_db *, void (*)(lsm_db *, void *), void *);
+
+/* ENDOFAPI */
+#ifdef __cplusplus
+} /* End of the 'extern "C"' block */
+#endif
+#endif /* ifndef _LSM_H */
diff --git a/ext/lsm1/lsmInt.h b/ext/lsm1/lsmInt.h
new file mode 100644
index 0000000..b346d8d
--- /dev/null
+++ b/ext/lsm1/lsmInt.h
@@ -0,0 +1,993 @@
+/*
+** 2011-08-18
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+** Internal structure definitions for the LSM module.
+*/
+#ifndef _LSM_INT_H
+#define _LSM_INT_H
+
+#include "lsm.h"
+#include <assert.h>
+#include <string.h>
+
+#include <stdarg.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <ctype.h>
+
+#ifdef _WIN32
+# ifdef _MSC_VER
+# define snprintf _snprintf
+# endif
+#else
+# include <unistd.h>
+#endif
+
+#ifdef NDEBUG
+# ifdef LSM_DEBUG_EXPENSIVE
+# undef LSM_DEBUG_EXPENSIVE
+# endif
+# ifdef LSM_DEBUG
+# undef LSM_DEBUG
+# endif
+#else
+# ifndef LSM_DEBUG
+# define LSM_DEBUG
+# endif
+#endif
+
+/*
+** Default values for various data structure parameters. These may be
+** overridden by calls to lsm_config().
+*/
+#define LSM_DFLT_PAGE_SIZE (4 * 1024)
+#define LSM_DFLT_BLOCK_SIZE (1 * 1024 * 1024)
+#define LSM_DFLT_AUTOFLUSH (1 * 1024 * 1024)
+#define LSM_DFLT_AUTOCHECKPOINT ((i64)(2 * 1024 * 1024)) /* outer parens keep the cast atomic */
+#define LSM_DFLT_AUTOWORK 1
+#define LSM_DFLT_LOG_SIZE (128*1024)
+#define LSM_DFLT_AUTOMERGE 4
+#define LSM_DFLT_SAFETY LSM_SAFETY_NORMAL
+#define LSM_DFLT_MMAP (LSM_IS_64_BIT ? 1 : 32768) /* LSM_IS_64_BIT is defined below; expanded at use */
+#define LSM_DFLT_MULTIPLE_PROCESSES 1
+#define LSM_DFLT_USE_LOG 1
+
+/* Initial values for log file checksums. These are only used if the
+** database file does not contain a valid checkpoint. */
+#define LSM_CKSUM0_INIT 42
+#define LSM_CKSUM1_INIT 42
+
+/* "mmap" mode is currently only used in environments with 64-bit address
+** spaces. The following macro is used to test for this. */
+#define LSM_IS_64_BIT (sizeof(void*)==8)
+
+#define LSM_AUTOWORK_QUANT 32
+
+typedef struct Database Database;
+typedef struct DbLog DbLog;
+typedef struct FileSystem FileSystem;
+typedef struct Freelist Freelist;
+typedef struct FreelistEntry FreelistEntry;
+typedef struct Level Level;
+typedef struct LogMark LogMark;
+typedef struct LogRegion LogRegion;
+typedef struct LogWriter LogWriter;
+typedef struct LsmString LsmString;
+typedef struct Mempool Mempool;
+typedef struct Merge Merge;
+typedef struct MergeInput MergeInput;
+typedef struct MetaPage MetaPage;
+typedef struct MultiCursor MultiCursor;
+typedef struct Page Page;
+typedef struct Redirect Redirect;
+typedef struct Segment Segment;
+typedef struct SegmentMerger SegmentMerger;
+typedef struct ShmChunk ShmChunk;
+typedef struct ShmHeader ShmHeader;
+typedef struct ShmReader ShmReader;
+typedef struct Snapshot Snapshot;
+typedef struct TransMark TransMark;
+typedef struct Tree Tree;
+typedef struct TreeCursor TreeCursor;
+typedef struct TreeHeader TreeHeader;
+typedef struct TreeMark TreeMark;
+typedef struct TreeRoot TreeRoot;
+
+#ifndef _SQLITEINT_H_
+typedef unsigned char u8;
+typedef unsigned short int u16;
+typedef unsigned int u32;
+typedef lsm_i64 i64;
+typedef unsigned long long int u64;
+#endif
+
+/* A page number is a 64-bit integer. */
+typedef i64 Pgno;
+
+#ifdef LSM_DEBUG
+int lsmErrorBkpt(int);
+#else
+# define lsmErrorBkpt(x) (x)
+#endif
+
+#define LSM_PROTOCOL_BKPT lsmErrorBkpt(LSM_PROTOCOL)
+#define LSM_IOERR_BKPT lsmErrorBkpt(LSM_IOERR)
+#define LSM_NOMEM_BKPT lsmErrorBkpt(LSM_NOMEM)
+#define LSM_CORRUPT_BKPT lsmErrorBkpt(LSM_CORRUPT)
+#define LSM_MISUSE_BKPT lsmErrorBkpt(LSM_MISUSE)
+
+#define unused_parameter(x) (void)(x) /* Evaluate x and discard the result */
+#define array_size(x) (sizeof(x)/sizeof((x)[0])) /* Element count of array x; x is parenthesized before indexing */
+
+
+/* The size of each shared-memory chunk */
+#define LSM_SHM_CHUNK_SIZE (32*1024)
+
+/* The number of bytes reserved at the start of each shm chunk for MM. */
+#define LSM_SHM_CHUNK_HDR (sizeof(ShmChunk))
+
+/* The number of available read locks. */
+#define LSM_LOCK_NREADER 6
+
+/* The number of available read-write client locks. */
+#define LSM_LOCK_NRWCLIENT 16
+
+/* Lock definitions.
+*/
+#define LSM_LOCK_DMS1 1 /* Serialize connect/disconnect ops */
+#define LSM_LOCK_DMS2 2 /* Read-write connections */
+#define LSM_LOCK_DMS3 3 /* Read-only connections */
+#define LSM_LOCK_WRITER 4
+#define LSM_LOCK_WORKER 5
+#define LSM_LOCK_CHECKPOINTER 6
+#define LSM_LOCK_ROTRANS 7
+#define LSM_LOCK_READER(i) ((i) + LSM_LOCK_ROTRANS + 1) /* Evaluates to (i) + 8 */
+#define LSM_LOCK_RWCLIENT(i) ((i) + LSM_LOCK_READER(LSM_LOCK_NREADER)) /* (i) + 14 while LSM_LOCK_NREADER is 6 */
+
+#define LSM_N_LOCK LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT) /* 30 with the NREADER/NRWCLIENT values above */
+
+/*
+** Meta-page size and usable size.
+*/
+#define LSM_META_PAGE_SIZE 4096
+
+#define LSM_META_RW_PAGE_SIZE (LSM_META_PAGE_SIZE - LSM_N_LOCK)
+
+/*
+** Hard limit on the number of free-list entries that may be stored in
+** a checkpoint (the remainder are stored as a system record in the LSM).
+** See also LSM_CONFIG_MAX_FREELIST.
+*/
+#define LSM_MAX_FREELIST_ENTRIES 24
+
+#define LSM_MAX_BLOCK_REDIRECTS 16
+
+#define LSM_ATTEMPTS_BEFORE_PROTOCOL 10000
+
+
+/*
+** Each entry stored in the LSM (or in-memory tree structure) has an
+** associated mask of the following flags.
+*/
+#define LSM_START_DELETE 0x01 /* Start of open-ended delete range */
+#define LSM_END_DELETE 0x02 /* End of open-ended delete range */
+#define LSM_POINT_DELETE 0x04 /* Delete this key */
+#define LSM_INSERT 0x08 /* Insert this key and value */
+#define LSM_SEPARATOR 0x10 /* True if entry is separator key only */
+#define LSM_SYSTEMKEY 0x20 /* True if entry is a system key (FREELIST) */
+
+#define LSM_CONTIGUOUS 0x40 /* Used in lsm_tree.c */
+
+/*
+** A string that can grow by appending.
+*/
+struct LsmString {
+ lsm_env *pEnv; /* Run-time environment */
+ int n; /* Size of string. -1 indicates error */
+ int nAlloc; /* Space allocated for z[] */
+ char *z; /* The string content */
+};
+
+typedef struct LsmFile LsmFile;
+struct LsmFile {
+ lsm_file *pFile; /* File handle of the type used by the lsmEnvXXX() wrappers */
+ LsmFile *pNext; /* presumably the next entry in a linked list of LsmFile - confirm */
+};
+
+/*
+** An instance of the following type is used to store an ordered list of
+** u32 values.
+**
+** Note: This is a place-holder implementation. It should be replaced by
+** a version that avoids making a single large allocation when the array
+** contains a large number of values. For this reason, the internals of
+** this object should only be manipulated by the intArrayXXX() functions in
+** lsm_tree.c.
+*/
+typedef struct IntArray IntArray;
+struct IntArray {
+ int nAlloc; /* presumably the allocated capacity of aArray[] - confirm */
+ int nArray; /* presumably the number of values in use in aArray[] - confirm */
+ u32 *aArray; /* The stored u32 values */
+};
+
+struct Redirect {
+ int n; /* Number of redirects */
+ struct RedirectEntry {
+ int iFrom; /* presumably the block redirected from - confirm */
+ int iTo; /* presumably the block redirected to - confirm */
+ } *a; /* presumably an array of n RedirectEntry elements - confirm */
+};
+
+/*
+** An instance of this structure represents a point in the history of the
+** tree structure to roll back to. Refer to comments in lsm_tree.c for
+** details.
+*/
+struct TreeMark {
+ u32 iRoot; /* Offset of root node in shm file */
+ u32 nHeight; /* Current height of tree structure */
+ u32 iWrite; /* Write offset in shm file */
+ u32 nChunk; /* Number of chunks in shared-memory file */
+ u32 iFirst; /* First chunk in linked list */
+ u32 iNextShmid; /* Next id to allocate */
+ int iRollback; /* Index in lsm->rollback to revert to */
+};
+
+/*
+** An instance of this structure represents a point in the database log.
+*/
+struct LogMark {
+ i64 iOff; /* Offset into log (see lsm_log.c) */
+ int nBuf; /* Size of in-memory buffer here */
+ u8 aBuf[8]; /* Bytes of content in aBuf[] */
+ u32 cksum0; /* Checksum 0 at offset (iOff-nBuf) */
+ u32 cksum1; /* Checksum 1 at offset (iOff-nBuf) */
+};
+
+struct TransMark {
+ TreeMark tree;
+ LogMark log;
+};
+
+/*
+** A structure that defines the start and end offsets of a region in the
+** log file. The size of the region in bytes is (iEnd - iStart), so if
+** iEnd==iStart the region is zero bytes in size.
+*/
+struct LogRegion {
+ i64 iStart; /* Start of region in log file */
+ i64 iEnd; /* End of region in log file */
+};
+
+struct DbLog {
+ u32 cksum0; /* Checksum 0 at offset iOff */
+ u32 cksum1; /* Checksum 1 at offset iOff */
+ i64 iSnapshotId; /* Log space has been reclaimed to this ss */
+ LogRegion aRegion[3]; /* Log file regions (see docs in lsm_log.c) */
+};
+
+struct TreeRoot {
+ u32 iRoot; /* presumably as TreeMark.iRoot (root node offset in shm) - confirm */
+ u32 nHeight; /* presumably as TreeMark.nHeight (height of tree) - confirm */
+ u32 nByte; /* Total size of this tree in bytes */
+ u32 iTransId; /* NOTE(review): undocumented; name suggests a transaction id */
+};
+
+/*
+** Tree header structure.
+*/
+struct TreeHeader {
+ u32 iUsedShmid; /* Id of first shm chunk used by this tree */
+ u32 iNextShmid; /* Shm-id of next chunk allocated */
+ u32 iFirst; /* Chunk number of smallest shm-id */
+ u32 nChunk; /* Number of chunks in shared-memory file */
+ TreeRoot root; /* Root and height of current tree */
+ u32 iWrite; /* Write offset in shm file */
+ TreeRoot oldroot; /* Root and height of the previous tree */
+ u32 iOldShmid; /* Last shm-id used by previous tree */
+ u32 iUsrVersion; /* get/set_user_version() value */
+ i64 iOldLog; /* Log offset associated with old tree */
+ u32 oldcksum0;
+ u32 oldcksum1;
+ DbLog log; /* Current layout of log file */
+ u32 aCksum[2]; /* Checksums 1 and 2. */
+};
+
+/*
+** Database handle structure.
+**
+** mLock:
+** A bitmask representing the locks currently held by the connection.
+** An LSM database supports N distinct locks, where N is some number less
+** than or equal to 32. Locks are numbered starting from 1 (see the
+** definitions for LSM_LOCK_WRITER and co.).
+**
+** The least significant 32-bits in mLock represent EXCLUSIVE locks. The
+** most significant are SHARED locks. So, if a connection holds a SHARED
+** lock on lock region iLock, then the following is true:
+**
+** (mLock & ((u64)1 << (iLock+32-1)))
+**
+** Or for an EXCLUSIVE lock:
+**
+** (mLock & ((u64)1 << (iLock-1)))
+**
+** pCsr:
+** Points to the head of a linked list that contains all currently open
+** cursors. Once this list becomes empty, the user has no outstanding
+** cursors and the database handle can be successfully closed.
+**
+** pCsrCache:
+** This list contains cursor objects that have been closed using
+** lsm_csr_close(). Each time a cursor is closed, it is shifted from
+** the pCsr list to this list. When a new cursor is opened, this list
+** is inspected to see if there exists a cursor object that can be
+** reused. This is an optimization only.
+*/
+struct lsm_db {
+
+ /* Database handle configuration */
+ lsm_env *pEnv; /* runtime environment */
+ int (*xCmp)(void *, int, void *, int); /* Compare function */
+
+ /* Values configured by calls to lsm_config */
+ int eSafety; /* LSM_SAFETY_OFF, NORMAL or FULL */
+ int bAutowork; /* Configured by LSM_CONFIG_AUTOWORK */
+ int nTreeLimit; /* Configured by LSM_CONFIG_AUTOFLUSH */
+ int nMerge; /* Configured by LSM_CONFIG_AUTOMERGE */
+ int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */
+ int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */
+ int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */
+ int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */
+ int iMmap; /* Configured by LSM_CONFIG_MMAP */
+ i64 nAutockpt; /* Configured by LSM_CONFIG_AUTOCHECKPOINT */
+ int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */
+ int bReadonly; /* Configured by LSM_CONFIG_READONLY */
+ lsm_compress compress; /* Compression callbacks */
+ lsm_compress_factory factory; /* Compression callback factory */
+
+ /* Sub-system handles */
+ FileSystem *pFS; /* On-disk portion of database */
+ Database *pDatabase; /* Database shared data */
+
+ int iRwclient; /* Read-write client lock held (-1 == none) */
+
+ /* Client transaction context */
+ Snapshot *pClient; /* Client snapshot */
+ int iReader; /* Read lock held (-1 == unlocked) */
+ int bRoTrans; /* True if a read-only db trans is open */
+ MultiCursor *pCsr; /* List of all open cursors */
+ LogWriter *pLogWriter; /* Context for writing to the log file */
+ int nTransOpen; /* Number of opened write transactions */
+ int nTransAlloc; /* Allocated size of aTrans[] array */
+ TransMark *aTrans; /* Array of marks for transaction rollback */
+ IntArray rollback; /* List of tree-nodes to roll back */
+ int bDiscardOld; /* True if lsmTreeDiscardOld() was called */
+
+ MultiCursor *pCsrCache; /* List of all closed cursors */
+
+ /* Worker context */
+ Snapshot *pWorker; /* Worker snapshot (or NULL) */
+ Freelist *pFreelist; /* See sortedNewToplevel() */
+ int bUseFreelist; /* True to use pFreelist */
+ int bIncrMerge; /* True if currently doing a merge */
+
+ int bInFactory; /* True if within factory.xFactory() */
+
+ /* Debugging message callback */
+ void (*xLog)(void *, int, const char *); /* Same signature as the callback lsm_config_log() takes */
+ void *pLogCtx; /* presumably the context pointer handed to xLog - confirm */
+
+ /* Work done notification callback */
+ void (*xWork)(lsm_db *, void *); /* Same signature as the callback lsm_config_work_hook() takes */
+ void *pWorkCtx; /* presumably the context pointer handed to xWork - confirm */
+
+ u64 mLock; /* Mask of current locks. See lsmShmLock(). */
+ lsm_db *pNext; /* Next connection to same database */
+
+ int nShm; /* Size of apShm[] array */
+ void **apShm; /* Shared memory chunks */
+ ShmHeader *pShmhdr; /* Live shared-memory header */
+ TreeHeader treehdr; /* Local copy of tree-header */
+ u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)]; /* Buffer of exactly LSM_META_PAGE_SIZE bytes */
+};
+
+struct Segment {
+ Pgno iFirst; /* First page of this run */
+ Pgno iLastPg; /* Last page of this run */
+ Pgno iRoot; /* Root page number (if any) */
+ int nSize; /* Size of this run in pages */
+
+ Redirect *pRedirect; /* Block redirects (or NULL) */
+};
+
+/*
+** iSplitTopic/pSplitKey/nSplitKey:
+** If nRight>0, this buffer contains a copy of the largest key that has
+** already been written to the left-hand-side of the level.
+*/
+struct Level {
+ Segment lhs; /* Left-hand (main) segment */
+ int nRight; /* Size of aRhs[] array */
+ Segment *aRhs; /* Old segments being merged into this */
+ int iSplitTopic; /* Split key topic (if nRight>0) */
+ void *pSplitKey; /* Pointer to split-key (if nRight>0) */
+ int nSplitKey; /* Number of bytes in split-key */
+
+ u16 iAge; /* Number of times data has been written */
+ u16 flags; /* Mask of LEVEL_XXX bits */
+ Merge *pMerge; /* Merge operation currently underway */
+ Level *pNext; /* Next level in tree */
+};
+
+/*
+** The Level.flags field is set to a combination of the following bits.
+**
+** LEVEL_FREELIST_ONLY:
+** Set if the level consists entirely of free-list entries.
+**
+** LEVEL_INCOMPLETE:
+** This is set while a new toplevel level is being constructed. It is
+** never set for any level other than a new toplevel.
+*/
+#define LEVEL_FREELIST_ONLY 0x0001
+#define LEVEL_INCOMPLETE 0x0002
+
+
+/*
+** A structure describing an ongoing merge. There is an instance of this
+** structure for every Level currently undergoing a merge in the worker
+** snapshot.
+**
+** It is assumed that code that uses an instance of this structure has
+** access to the associated Level struct.
+**
+** iOutputOff:
+** The byte offset to write to next within the last page of the
+** output segment.
+*/
+struct MergeInput {
+ Pgno iPg; /* Page on which next input is stored */
+ int iCell; /* Cell containing next input to merge */
+};
+struct Merge {
+ int nInput; /* Number of input runs being merged */
+ MergeInput *aInput; /* Array nInput entries in size */
+ MergeInput splitkey; /* Location in file of current splitkey */
+ int nSkip; /* Number of separators entries to skip */
+ int iOutputOff; /* Write offset on output page */
+ Pgno iCurrentPtr; /* Current pointer value */
+};
+
+/*
+** Takes a Segment pointer; true if the separators array is valid.
+** NOTE(review): struct Segment as declared in this header has no "sep"
+** member, so this macro cannot compile if expanded - confirm it is stale.
+*/
+#define segmentHasSeparators(pSegment) ((pSegment)->sep.iFirst>0)
+
+/*
+** The values that accompany the lock held by a database reader.
+*/
+struct ShmReader {
+ u32 iTreeId;
+ i64 iLsmId;
+};
+
+/*
+** An instance of this structure is stored in the first shared-memory
+** page. The shared-memory header.
+**
+** bWriter:
+** Immediately after opening a write transaction taking the WRITER lock,
+** each writer client sets this flag. It is cleared right before the
+** WRITER lock is relinquished. If a subsequent writer finds that this
+** flag is already set when a write transaction is opened, this indicates
+** that a previous writer failed mid-transaction.
+**
+** iMetaPage:
+** If the database file does not contain a valid, synced, checkpoint, this
+** value is set to 0. Otherwise, it is set to the meta-page number that
+** contains the most recently written checkpoint (either 1 or 2).
+**
+** hdr1, hdr2:
+** The two copies of the in-memory tree header. Two copies are required
+** in case a writer fails while updating one of them.
+*/
+struct ShmHeader {
+ u32 aSnap1[LSM_META_PAGE_SIZE / 4]; /* LSM_META_PAGE_SIZE/4 (1024) u32 slots */
+ u32 aSnap2[LSM_META_PAGE_SIZE / 4]; /* Same size as aSnap1[] */
+ u32 bWriter; /* Writer-in-progress flag; see bWriter above */
+ u32 iMetaPage; /* 0, 1 or 2; see iMetaPage above */
+ TreeHeader hdr1; /* First copy of the tree header */
+ TreeHeader hdr2; /* Second copy of the tree header */
+ ShmReader aReader[LSM_LOCK_NREADER]; /* One entry per available read lock */
+};
+
+/*
+** An instance of this structure is stored at the start of each shared-memory
+** chunk except the first (which is the header chunk - see above).
+*/
+struct ShmChunk {
+ u32 iShmid;
+ u32 iNext;
+};
+
+/*
+** Maximum number of shared-memory chunks allowed in the *-shm file. Since
+** each shared-memory chunk is 32KB in size, this is a theoretical limit only.
+*/
+#define LSM_MAX_SHMCHUNKS (1<<30)
+
+/* True if shm-sequence "a" >= "b", judged by their u32 difference; args may be expressions */
+#define shm_sequence_ge(a, b) (((u32)(a)-(u32)(b)) < LSM_MAX_SHMCHUNKS)
+
+#define LSM_APPLIST_SZ 4
+
+/*
+** An instance of the following structure stores the in-memory part of
+** the current free block list. This structure is to the free block list
+** as the in-memory tree is to the user's database content. The contents
+** of the free block list is found by merging the in-memory components
+** with those stored in the LSM, just as the contents of the database is
+** found by merging the in-memory tree with the user data entries in the
+** LSM.
+**
+** Each FreelistEntry structure in the array represents either an insert
+** or delete operation on the free-list. For deletes, the FreelistEntry.iId
+** field is set to -1. For inserts, it is set to zero or greater.
+**
+** The array of FreelistEntry structures is always sorted in order of
+** block number (ascending).
+**
+** When the in-memory free block list is written into the LSM, each insert
+** operation is written separately. The entry key is the bitwise inverse
+** of the block number as a 32-bit big-endian integer. This is done so that
+** the entries in the LSM are sorted in descending order of block id.
+** The associated value is the snapshot id, formatted as a varint.
+*/
+struct Freelist {
+ FreelistEntry *aEntry; /* Free list entries */
+ int nEntry; /* Number of valid slots in aEntry[] */
+ int nAlloc; /* Allocated size of aEntry[] */
+};
+struct FreelistEntry {
+ u32 iBlk; /* Block number */
+ i64 iId; /* Largest snapshot id to use this block */
+};
+
+/*
+** A snapshot of a database. A snapshot contains all the information required
+** to read or write a database file on disk. See the description of struct
+** Database below for further details.
+*/
+struct Snapshot {
+ Database *pDatabase; /* Database this snapshot belongs to */
+ u32 iCmpId; /* Id of compression scheme */
+ Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */
+ i64 iId; /* Snapshot id */
+ i64 iLogOff; /* Log file offset */
+ Redirect redirect; /* Block redirection array */
+
+ /* Used by worker snapshots only */
+ int nBlock; /* Number of blocks in database file */
+ Pgno aiAppend[LSM_APPLIST_SZ]; /* Append point list */
+ Freelist freelist; /* Free block list */
+ u32 nWrite; /* Total number of pages written to disk */
+};
+#define LSM_INITIAL_SNAPSHOT_ID 11
+
+/*
+** Functions from file "lsm_ckpt.c".
+*/
+int lsmCheckpointWrite(lsm_db *, u32 *);
+int lsmCheckpointLevels(lsm_db *, int, void **, int *);
+int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal);
+
+int lsmCheckpointRecover(lsm_db *);
+int lsmCheckpointDeserialize(lsm_db *, int, u32 *, Snapshot **);
+
+int lsmCheckpointLoadWorker(lsm_db *pDb);
+int lsmCheckpointStore(lsm_db *pDb, int);
+
+int lsmCheckpointLoad(lsm_db *pDb, int *);
+int lsmCheckpointLoadOk(lsm_db *pDb, int);
+int lsmCheckpointClientCacheOk(lsm_db *);
+
+u32 lsmCheckpointNBlock(u32 *);
+i64 lsmCheckpointId(u32 *, int);
+u32 lsmCheckpointNWrite(u32 *, int);
+i64 lsmCheckpointLogOffset(u32 *);
+int lsmCheckpointPgsz(u32 *);
+int lsmCheckpointBlksz(u32 *);
+void lsmCheckpointLogoffset(u32 *aCkpt, DbLog *pLog);
+void lsmCheckpointZeroLogoffset(lsm_db *);
+
+int lsmCheckpointSaveWorker(lsm_db *pDb, int);
+int lsmDatabaseFull(lsm_db *pDb);
+int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite);
+
+int lsmCheckpointSize(lsm_db *db, int *pnByte);
+
+int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId);
+
+/*
+** Functions from file "lsm_tree.c".
+*/
+int lsmTreeNew(lsm_env *, int (*)(void *, int, void *, int), Tree **ppTree);
+void lsmTreeRelease(lsm_env *, Tree *);
+int lsmTreeInit(lsm_db *);
+int lsmTreeRepair(lsm_db *);
+
+void lsmTreeMakeOld(lsm_db *pDb);
+void lsmTreeDiscardOld(lsm_db *pDb);
+int lsmTreeHasOld(lsm_db *pDb);
+
+int lsmTreeSize(lsm_db *);
+int lsmTreeEndTransaction(lsm_db *pDb, int bCommit);
+int lsmTreeLoadHeader(lsm_db *pDb, int *);
+int lsmTreeLoadHeaderOk(lsm_db *, int);
+
+int lsmTreeInsert(lsm_db *pDb, void *pKey, int nKey, void *pVal, int nVal);
+int lsmTreeDelete(lsm_db *db, void *pKey1, int nKey1, void *pKey2, int nKey2);
+void lsmTreeRollback(lsm_db *pDb, TreeMark *pMark);
+void lsmTreeMark(lsm_db *pDb, TreeMark *pMark);
+
+int lsmTreeCursorNew(lsm_db *pDb, int, TreeCursor **);
+void lsmTreeCursorDestroy(TreeCursor *);
+
+int lsmTreeCursorSeek(TreeCursor *pCsr, void *pKey, int nKey, int *pRes);
+int lsmTreeCursorNext(TreeCursor *pCsr);
+int lsmTreeCursorPrev(TreeCursor *pCsr);
+int lsmTreeCursorEnd(TreeCursor *pCsr, int bLast);
+void lsmTreeCursorReset(TreeCursor *pCsr);
+int lsmTreeCursorKey(TreeCursor *pCsr, int *pFlags, void **ppKey, int *pnKey);
+int lsmTreeCursorFlags(TreeCursor *pCsr);
+int lsmTreeCursorValue(TreeCursor *pCsr, void **ppVal, int *pnVal);
+int lsmTreeCursorValid(TreeCursor *pCsr);
+int lsmTreeCursorSave(TreeCursor *pCsr);
+
+void lsmFlagsToString(int flags, char *zFlags);
+
+/*
+** Functions from file "mem.c".
+*/
+void *lsmMalloc(lsm_env*, size_t);
+void lsmFree(lsm_env*, void *);
+void *lsmRealloc(lsm_env*, void *, size_t);
+void *lsmReallocOrFree(lsm_env*, void *, size_t);
+void *lsmReallocOrFreeRc(lsm_env *, void *, size_t, int *);
+
+void *lsmMallocZeroRc(lsm_env*, size_t, int *);
+void *lsmMallocRc(lsm_env*, size_t, int *);
+
+void *lsmMallocZero(lsm_env *pEnv, size_t);
+char *lsmMallocStrdup(lsm_env *pEnv, const char *);
+
+/*
+** Functions from file "lsm_mutex.c".
+*/
+int lsmMutexStatic(lsm_env*, int, lsm_mutex **);
+int lsmMutexNew(lsm_env*, lsm_mutex **);
+void lsmMutexDel(lsm_env*, lsm_mutex *);
+void lsmMutexEnter(lsm_env*, lsm_mutex *);
+int lsmMutexTry(lsm_env*, lsm_mutex *);
+void lsmMutexLeave(lsm_env*, lsm_mutex *);
+
+#ifndef NDEBUG
+int lsmMutexHeld(lsm_env *, lsm_mutex *);
+int lsmMutexNotHeld(lsm_env *, lsm_mutex *);
+#endif
+
+/**************************************************************************
+** Start of functions from "lsm_file.c".
+*/
+int lsmFsOpen(lsm_db *, const char *, int);
+int lsmFsOpenLog(lsm_db *, int *);
+void lsmFsCloseLog(lsm_db *);
+void lsmFsClose(FileSystem *);
+
+int lsmFsUnmap(FileSystem *);
+
+int lsmFsConfigure(lsm_db *db);
+
+int lsmFsBlockSize(FileSystem *);
+void lsmFsSetBlockSize(FileSystem *, int);
+int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom);
+
+int lsmFsPageSize(FileSystem *);
+void lsmFsSetPageSize(FileSystem *, int);
+
+int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId);
+
+/* Creating, populating, gobbling and deleting sorted runs. */
+void lsmFsGobble(lsm_db *, Segment *, Pgno *, int);
+int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *);
+int lsmFsSortedFinish(FileSystem *, Segment *);
+int lsmFsSortedAppend(FileSystem *, Snapshot *, Level *, int, Page **);
+int lsmFsSortedPadding(FileSystem *, Snapshot *, Segment *);
+
+/* Functions to retrieve the lsm_env pointer from a FileSystem or Page object */
+lsm_env *lsmFsEnv(FileSystem *);
+lsm_env *lsmPageEnv(Page *);
+FileSystem *lsmPageFS(Page *);
+
+int lsmFsSectorSize(FileSystem *);
+
+void lsmSortedSplitkey(lsm_db *, Level *, int *);
+
+/* Reading sorted run content. */
+int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg);
+int lsmFsDbPageGet(FileSystem *, Segment *, Pgno, Page **);
+int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **);
+
+u8 *lsmFsPageData(Page *, int *);
+int lsmFsPageRelease(Page *);
+int lsmFsPagePersist(Page *);
+void lsmFsPageRef(Page *);
+Pgno lsmFsPageNumber(Page *);
+
+int lsmFsNRead(FileSystem *);
+int lsmFsNWrite(FileSystem *);
+
+int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **);
+int lsmFsMetaPageRelease(MetaPage *);
+u8 *lsmFsMetaPageData(MetaPage *, int *);
+
+#ifdef LSM_DEBUG
+int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg);
+int lsmFsIntegrityCheck(lsm_db *);
+#endif
+
+Pgno lsmFsRedirectPage(FileSystem *, Redirect *, Pgno);
+
+int lsmFsPageWritable(Page *);
+
+/* Functions to read, write and sync the log file. */
+int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr);
+int lsmFsSyncLog(FileSystem *pFS);
+int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr);
+int lsmFsTruncateLog(FileSystem *pFS, i64 nByte);
+int lsmFsTruncateDb(FileSystem *pFS, i64 nByte);
+int lsmFsCloseAndDeleteLog(FileSystem *pFS);
+
+LsmFile *lsmFsDeferClose(FileSystem *pFS);
+
+/* And to sync the db file */
+int lsmFsSyncDb(FileSystem *, int);
+
+void lsmFsFlushWaiting(FileSystem *, int *);
+
+/* Used by lsm_info(ARRAY_STRUCTURE) and lsm_config(MMAP) */
+int lsmInfoArrayStructure(lsm_db *pDb, int bBlock, Pgno iFirst, char **pzOut);
+int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut);
+int lsmConfigMmap(lsm_db *pDb, int *piParam);
+
+int lsmEnvOpen(lsm_env *, const char *, int, lsm_file **);
+int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile);
+int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock);
+int lsmEnvTestLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int nLock, int);
+
+int lsmEnvShmMap(lsm_env *, lsm_file *, int, int, void **);
+void lsmEnvShmBarrier(lsm_env *);
+void lsmEnvShmUnmap(lsm_env *, lsm_file *, int);
+
+void lsmEnvSleep(lsm_env *, int);
+
+int lsmFsReadSyncedId(lsm_db *db, int, i64 *piVal);
+
+int lsmFsSegmentContainsPg(FileSystem *pFS, Segment *, Pgno, int *);
+
+void lsmFsPurgeCache(FileSystem *);
+
+/*
+** End of functions from "lsm_file.c".
+**************************************************************************/
+
+/*
+** Functions from file "lsm_sorted.c".
+*/
+int lsmInfoPageDump(lsm_db *, Pgno, int, char **);
+void lsmSortedCleanup(lsm_db *);
+int lsmSortedAutoWork(lsm_db *, int nUnit);
+
+int lsmSortedWalkFreelist(lsm_db *, int, int (*)(void *, int, i64), void *);
+
+int lsmSaveWorker(lsm_db *, int);
+
+int lsmFlushTreeToDisk(lsm_db *pDb);
+
+void lsmSortedRemap(lsm_db *pDb);
+
+void lsmSortedFreeLevel(lsm_env *pEnv, Level *);
+
+int lsmSortedAdvanceAll(lsm_db *pDb);
+
+int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *);
+int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *);
+
+void *lsmSortedSplitKey(Level *pLevel, int *pnByte);
+
+void lsmSortedSaveTreeCursors(lsm_db *);
+
+int lsmMCursorNew(lsm_db *, MultiCursor **);
+void lsmMCursorClose(MultiCursor *, int);
+int lsmMCursorSeek(MultiCursor *, int, void *, int , int);
+int lsmMCursorFirst(MultiCursor *);
+int lsmMCursorPrev(MultiCursor *);
+int lsmMCursorLast(MultiCursor *);
+int lsmMCursorValid(MultiCursor *);
+int lsmMCursorNext(MultiCursor *);
+int lsmMCursorKey(MultiCursor *, void **, int *);
+int lsmMCursorValue(MultiCursor *, void **, int *);
+int lsmMCursorType(MultiCursor *, int *);
+lsm_db *lsmMCursorDb(MultiCursor *);
+void lsmMCursorFreeCache(lsm_db *);
+
+int lsmSaveCursors(lsm_db *pDb);
+int lsmRestoreCursors(lsm_db *pDb);
+
+void lsmSortedDumpStructure(lsm_db *pDb, Snapshot *, int, int, const char *);
+void lsmFsDumpBlocklists(lsm_db *);
+
+void lsmSortedExpandBtreePage(Page *pPg, int nOrig);
+
+void lsmPutU32(u8 *, u32);
+u32 lsmGetU32(u8 *);
+u64 lsmGetU64(u8 *);
+
+/*
+** Functions from "lsm_varint.c".
+*/
+int lsmVarintPut32(u8 *, int);
+int lsmVarintGet32(u8 *, int *);
+int lsmVarintPut64(u8 *aData, i64 iVal);
+int lsmVarintGet64(const u8 *aData, i64 *piVal);
+
+int lsmVarintLen32(int);
+int lsmVarintSize(u8 c);
+
+/*
+** Functions from file "main.c".
+*/
+void lsmLogMessage(lsm_db *, int, const char *, ...);
+int lsmInfoFreelist(lsm_db *pDb, char **pzOut);
+
+/*
+** Functions from file "lsm_log.c".
+*/
+int lsmLogBegin(lsm_db *pDb);
+int lsmLogWrite(lsm_db *, int, void *, int, void *, int);
+int lsmLogCommit(lsm_db *);
+void lsmLogEnd(lsm_db *pDb, int bCommit);
+void lsmLogTell(lsm_db *, LogMark *);
+void lsmLogSeek(lsm_db *, LogMark *);
+void lsmLogClose(lsm_db *);
+
+int lsmLogRecover(lsm_db *);
+int lsmInfoLogStructure(lsm_db *pDb, char **pzVal);
+
+/* Valid values for the second argument to lsmLogWrite(). */
+#define LSM_WRITE 0x06
+#define LSM_DELETE 0x08
+#define LSM_DRANGE 0x0A
+
+/**************************************************************************
+** Functions from file "lsm_shared.c".
+*/
+
+int lsmDbDatabaseConnect(lsm_db*, const char *);
+void lsmDbDatabaseRelease(lsm_db *);
+
+int lsmBeginReadTrans(lsm_db *);
+int lsmBeginWriteTrans(lsm_db *);
+int lsmBeginFlush(lsm_db *);
+
+int lsmDetectRoTrans(lsm_db *db, int *);
+int lsmBeginRoTrans(lsm_db *db);
+
+int lsmBeginWork(lsm_db *);
+void lsmFinishWork(lsm_db *, int, int *);
+
+int lsmFinishRecovery(lsm_db *);
+void lsmFinishReadTrans(lsm_db *);
+int lsmFinishWriteTrans(lsm_db *, int);
+int lsmFinishFlush(lsm_db *, int);
+
+int lsmSnapshotSetFreelist(lsm_db *, int *, int);
+
+Snapshot *lsmDbSnapshotClient(lsm_db *);
+Snapshot *lsmDbSnapshotWorker(lsm_db *);
+
+void lsmSnapshotSetCkptid(Snapshot *, i64);
+
+Level *lsmDbSnapshotLevel(Snapshot *);
+void lsmDbSnapshotSetLevel(Snapshot *, Level *);
+
+void lsmDbRecoveryComplete(lsm_db *, int);
+
+int lsmBlockAllocate(lsm_db *, int, int *);
+int lsmBlockFree(lsm_db *, int);
+int lsmBlockRefree(lsm_db *, int);
+
+void lsmFreelistDeltaBegin(lsm_db *);
+void lsmFreelistDeltaEnd(lsm_db *);
+int lsmFreelistDelta(lsm_db *pDb);
+
+DbLog *lsmDatabaseLog(lsm_db *pDb);
+
+#ifdef LSM_DEBUG
+ int lsmHoldingClientMutex(lsm_db *pDb);
+ int lsmShmAssertLock(lsm_db *db, int iLock, int eOp);
+ int lsmShmAssertWorker(lsm_db *db);
+#endif
+
+void lsmFreeSnapshot(lsm_env *, Snapshot *);
+
+
+/* Candidate values for the 3rd argument to lsmShmLock() */
+#define LSM_LOCK_UNLOCK 0
+#define LSM_LOCK_SHARED 1
+#define LSM_LOCK_EXCL 2
+
+int lsmShmCacheChunks(lsm_db *db, int nChunk);
+int lsmShmLock(lsm_db *db, int iLock, int eOp, int bBlock);
+int lsmShmTestLock(lsm_db *db, int iLock, int nLock, int eOp);
+void lsmShmBarrier(lsm_db *db);
+
+#ifdef LSM_DEBUG
+void lsmShmHasLock(lsm_db *db, int iLock, int eOp);
+#else
+# define lsmShmHasLock(x,y,z)
+#endif
+
+int lsmReadlock(lsm_db *, i64 iLsm, u32 iShmMin, u32 iShmMax);
+
+int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse);
+int lsmTreeInUse(lsm_db *db, u32 iLsmId, int *pbInUse);
+int lsmFreelistAppend(lsm_env *pEnv, Freelist *p, int iBlk, i64 iId);
+
+int lsmDbMultiProc(lsm_db *);
+void lsmDbDeferredClose(lsm_db *, lsm_file *, LsmFile *);
+LsmFile *lsmDbRecycleFd(lsm_db *);
+
+int lsmWalkFreelist(lsm_db *, int, int (*)(void *, int, i64), void *);
+
+int lsmCheckCompressionId(lsm_db *, u32);
+
+
+/**************************************************************************
+** functions in lsm_str.c
+*/
+void lsmStringInit(LsmString*, lsm_env *pEnv);
+int lsmStringExtend(LsmString*, int);
+int lsmStringAppend(LsmString*, const char *, int);
+void lsmStringVAppendf(LsmString*, const char *zFormat, va_list, va_list);
+void lsmStringAppendf(LsmString*, const char *zFormat, ...);
+void lsmStringClear(LsmString*);
+char *lsmMallocPrintf(lsm_env*, const char*, ...);
+int lsmStringBinAppend(LsmString *pStr, const u8 *a, int n);
+
+int lsmStrlen(const char *zName);
+
+
+
+/*
+** Round up a number to the next larger multiple of 8. This is used
+** to force 8-byte alignment on 64-bit architectures.
+*/
+#define ROUND8(x) (((x)+7)&~7)
+
+#define LSM_MIN(x,y) ((y)<(x) ? (y) : (x)) /* Smaller of x and y; one argument is evaluated twice */
+#define LSM_MAX(x,y) ((y)<(x) ? (x) : (y)) /* Larger of x and y; one argument is evaluated twice */
+
+#endif
diff --git a/ext/lsm1/lsm_ckpt.c b/ext/lsm1/lsm_ckpt.c
new file mode 100644
index 0000000..cf4c55b
--- /dev/null
+++ b/ext/lsm1/lsm_ckpt.c
@@ -0,0 +1,1239 @@
+/*
+** 2011-09-11
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** This file contains code to read and write checkpoints.
+**
+** A checkpoint represents the database layout at a single point in time.
+** It includes a log offset. When an existing database is opened, the
+** current state is determined by reading the newest checkpoint and updating
+** it with all committed transactions from the log that follow the specified
+** offset.
+*/
+#include "lsmInt.h"
+
+/*
+** CHECKPOINT BLOB FORMAT:
+**
+** A checkpoint blob is a series of unsigned 32-bit integers stored in
+** big-endian byte order. As follows:
+**
+** Checkpoint header (see the CKPT_HDR_XXX #defines):
+**
+** 1. The checkpoint id MSW.
+** 2. The checkpoint id LSW.
+** 3. The number of integer values in the entire checkpoint, including
+** the two checksum values.
+** 4. The compression scheme id.
+** 5. The total number of blocks in the database.
+** 6. The block size.
+** 7. The number of levels.
+** 8. The nominal database page size.
+** 9. The number of pages (in total) written to the database file.
+**
+** Log pointer:
+**
+** 1. The log offset MSW.
+** 2. The log offset LSW.
+** 3. Log checksum 0.
+** 4. Log checksum 1.
+**
+** Note that the "log offset" is not the literal byte offset. Instead,
+** it is the byte offset multiplied by 2, with least significant bit
+** toggled each time the log pointer value is changed. This is to make
+** sure that this field changes each time the log pointer is updated,
+** even if the log file itself is disabled. See lsmTreeMakeOld().
+**
+** See ckptExportLog() and ckptImportLog().
+**
+** Append points:
+**
+** 8 integers (4 * 64-bit page numbers). See ckptExportAppendlist().
+**
+** For each level in the database, a level record. Formatted as follows:
+**
+** 0. Age of the level (least significant 16-bits). And flags mask (most
+** significant 16-bits).
+** 1. The number of right-hand segments (nRight, possibly 0),
+** 2. Segment record for left-hand segment (8 integers defined below),
+** 3. Segment record for each right-hand segment (8 integers defined below),
+** 4. If nRight>0, The number of segments involved in the merge
+** 5. if nRight>0, Current nSkip value (see Merge structure defn.),
+** 6. For each segment in the merge:
+** 6a. Page number of next cell to read during merge (this field
+** is 64-bits - 2 integers)
+** 6b. Cell number of next cell to read during merge
+** 7. Page containing current split-key (64-bits - 2 integers).
+** 8. Cell within page containing current split-key.
+** 9. Current pointer value (64-bits - 2 integers).
+**
+** The block redirect array:
+**
+** 1. Number of redirections (maximum LSM_MAX_BLOCK_REDIRECTS).
+** 2. For each redirection:
+** a. "from" block number
+** b. "to" block number
+**
+** The in-memory freelist entries. Each entry is either an insert or a
+** delete. The in-memory freelist is to the free-block-list as the
+** in-memory tree is to the users database content.
+**
+** 1. Number of free-list entries stored in checkpoint header.
+** 2. Number of free blocks (in total).
+** 3. Total number of blocks freed during database lifetime.
+** 4. For each entry:
+** 4a. Block number of free block.
+** 4b. A 64-bit integer (MSW followed by LSW). -1 for a delete entry,
+** or the associated checkpoint id for an insert.
+**
+** The checksum:
+**
+** 1. Checksum value 1.
+** 2. Checksum value 2.
+**
+** In the above, a segment record consists of the following four 64-bit
+** fields (converted to 2 * u32 by storing the MSW followed by LSW):
+**
+** 1. First page of array,
+** 2. Last page of array,
+** 3. Root page of array (or 0),
+** 4. Size of array in pages.
+*/
+
+/*
+** LARGE NUMBERS OF LEVEL RECORDS:
+**
+** A limit on the number of rhs segments that may be present in the database
+** file. Defining this limit ensures that all level records fit within
+** the 4096 byte limit for checkpoint blobs.
+**
+** The number of right-hand-side segments in a database is counted as
+** follows:
+**
+** * For each level in the database not undergoing a merge, add 1.
+**
+** * For each level in the database that is undergoing a merge, add
+** the number of segments on the rhs of the level.
+**
+** A level record not undergoing a merge is 10 integers. A level record
+** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the
+** separators from the next level) is (11*nRhs+20) integers. The maximum
+** per right-hand-side level is therefore 21 integers. So the maximum
+** size of all level records in a checkpoint is 21*40=840 integers.
+**
+** TODO: Before pointer values were changed from 32 to 64 bits, the above
+** used to come to 420 bytes - leaving significant space for a free-list
+** prefix. No more. To fix this, reduce the size of the level records in
+** a db snapshot, and improve management of the free-list tail in
+** lsm_sorted.c.
+*/
+#define LSM_MAX_RHS_SEGMENTS 40
+
+/*
+** LARGE NUMBERS OF FREELIST ENTRIES:
+**
+** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h)
+** on the number of free-list entries stored in a checkpoint. Since each
+** free-list entry consists of 3 integers, the maximum free-list size is
+** 3*100=300 integers. Combined with the limit on rhs segments defined
+** above, this ensures that a checkpoint always fits within a 4096 byte
+** meta page.
+**
+** If the database contains more than 100 free blocks, the "overflow" flag
+** in the checkpoint header is set and the remainder are stored in the
+** system FREELIST entry in the LSM (along with user data). The value
+** accompanying the FREELIST key in the LSM is, like a checkpoint, an array
+** of 32-bit big-endian integers. As follows:
+**
+** For each entry:
+** a. Block number of free block.
+** b. MSW of associated checkpoint id.
+** c. LSW of associated checkpoint id.
+**
+** The number of entries is not required - it is implied by the size of the
+** value blob containing the integer array.
+**
+** Note that the limit defined by LSM_MAX_FREELIST_ENTRIES is a hard limit.
+** The actual value used may be configured using LSM_CONFIG_MAX_FREELIST.
+*/
+
+/*
+** The argument to this macro must be of type u32. On a little-endian
+** architecture, it returns the u32 value that results from interpreting
+** the 4 bytes as a big-endian value. On a big-endian architecture, it
+** returns the value that would be produced by interpreting the 4 bytes
+** of the input value as a little-endian integer.
+*/
+#define BYTESWAP32(x) ( \
+ (((x)&0x000000FF)<<24) + (((x)&0x0000FF00)<<8) \
+ + (((x)&0x00FF0000)>>8) + (((x)&0xFF000000)>>24) \
+)
+
+static const int one = 1; /* Its first byte in memory is 1 only if little-endian */
+#define LSM_LITTLE_ENDIAN (*(u8 *)(&one))
+
+/* Sizes, in integers, of various parts of the checkpoint. */
+#define CKPT_HDR_SIZE 9
+#define CKPT_LOGPTR_SIZE 4
+#define CKPT_APPENDLIST_SIZE (LSM_APPLIST_SZ * 2)
+
+/* A #define to describe each integer in the checkpoint header. */
+#define CKPT_HDR_ID_MSW 0 /* Checkpoint id, most significant word */
+#define CKPT_HDR_ID_LSW 1 /* Checkpoint id, least significant word */
+#define CKPT_HDR_NCKPT 2 /* Integers in the checkpoint, incl. 2 checksums */
+#define CKPT_HDR_CMPID 3 /* Compression scheme id */
+#define CKPT_HDR_NBLOCK 4 /* Total number of blocks in the database */
+#define CKPT_HDR_BLKSZ 5 /* Block size */
+#define CKPT_HDR_NLEVEL 6 /* Number of levels */
+#define CKPT_HDR_PGSZ 7 /* Nominal database page size */
+#define CKPT_HDR_NWRITE 8 /* Pages (in total) written to the database file */
+/* Offsets of the four log-pointer integers, which follow the header. */
+#define CKPT_HDR_LO_MSW 9
+#define CKPT_HDR_LO_LSW 10
+#define CKPT_HDR_LO_CKSUM1 11
+#define CKPT_HDR_LO_CKSUM2 12
+
+typedef struct CkptBuffer CkptBuffer;
+
+/*
+** Dynamic buffer used to accumulate data for a checkpoint.
+*/
+struct CkptBuffer {
+ lsm_env *pEnv; /* Environment handed to the allocator when growing */
+ int nAlloc; /* Allocated size of aCkpt[], in u32 elements */
+ u32 *aCkpt; /* The checkpoint integers accumulated so far */
+};
+
+/*
+** Calculate the checksum of the checkpoint specified by arguments aCkpt and
+** nCkpt. Store the checksum in *piCksum1 and *piCksum2 before returning.
+**
+** The value of the nCkpt parameter includes the two checksum values at
+** the end of the checkpoint. They are not used as inputs to the checksum
+** calculation. The checksum is based on the array of (nCkpt-2) integers
+** at aCkpt[].
+*/
+static void ckptChecksum(u32 *aCkpt, u32 nCkpt, u32 *piCksum1, u32 *piCksum2){
+  u32 i;
+  u32 cksum1 = 1;
+  u32 cksum2 = 2;
+
+  if( nCkpt % 2 ){
+    cksum1 += aCkpt[nCkpt-3] & 0x0000FFFF;
+    cksum2 += aCkpt[nCkpt-3] & 0xFFFF0000;
+  }
+
+  for(i=0; (i+3)<nCkpt; i+=2){    /* NOTE(review): loop body restored; verify */
+    cksum1 += cksum2 + aCkpt[i];
+    cksum2 += cksum1 + aCkpt[i+1];
+  }
+
+  *piCksum1 = cksum1;
+  *piCksum2 = cksum2;
+}
+
+/*
+** Set integer iIdx of the checkpoint accumulating in buffer *p to iVal.
+*/
+static void ckptSetValue(CkptBuffer *p, int iIdx, u32 iVal, int *pRc){
+  if( *pRc ) return;              /* No-op once an error has been recorded */
+  if( iIdx>=p->nAlloc ){
+    int nNew = LSM_MAX(8, iIdx*2);  /* Grow to at least 8 entries */
+    p->aCkpt = (u32 *)lsmReallocOrFree(p->pEnv, p->aCkpt, nNew*sizeof(u32));
+    if( !p->aCkpt ){
+      *pRc = LSM_NOMEM_BKPT;
+      return;
+    }
+    p->nAlloc = nNew;
+  }
+  p->aCkpt[iIdx] = iVal;
+}
+
+/*
+** Argument aInt points to an array nInt elements in size. Switch the
+** endian-ness of each element of the array.
+*/
+static void ckptChangeEndianness(u32 *aInt, int nInt){
+  if( LSM_LITTLE_ENDIAN ){        /* Nothing to do on a big-endian host */
+    int i;
+    for(i=0; i<nInt; i++) aInt[i] = BYTESWAP32(aInt[i]);
+  }
+}
+
+/*
+** Object *p contains a checkpoint in native byte-order. The checkpoint is
+** nCkpt integers in size, not including any checksum. This function sets
+** the two checksum elements of the checkpoint accordingly.
+*/
+static void ckptAddChecksum(CkptBuffer *p, int nCkpt, int *pRc){
+  if( *pRc==LSM_OK ){
+    u32 aCksum[2] = {0, 0};
+    ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]);
+    ckptSetValue(p, nCkpt, aCksum[0], pRc);
+    ckptSetValue(p, nCkpt+1, aCksum[1], pRc);
+  }
+}
+
+static void ckptAppend64(CkptBuffer *p, int *piOut, i64 iVal, int *pRc){
+  u32 iMsw = (u32)((iVal >> 32) & 0xFFFFFFFF);  /* Upper half, stored first */
+  u32 iLsw = (u32)(iVal & 0xFFFFFFFF);          /* Lower half, stored second */
+  ckptSetValue(p, (*piOut)++, iMsw, pRc);
+  ckptSetValue(p, (*piOut)++, iLsw, pRc);
+}
+
+static i64 ckptRead64(u32 *a){          /* a[0] holds the MSW, a[1] the LSW */
+  return (i64)a[1] + (((i64)a[0]) << 32);
+}
+
+static i64 ckptGobble64(u32 *a, int *piIn){
+  u32 *aVal = &a[*piIn];          /* The two words to decode */
+  *piIn += 2;                     /* Advance the caller's read offset */
+  return ckptRead64(aVal);
+}
+
+
+/*
+** Append the 8-integer segment record for pSeg to checkpoint buffer p.
+*/
+static void ckptExportSegment(
+  Segment *pSeg,
+  CkptBuffer *p,
+  int *piOut,
+  int *pRc
+){
+  i64 aField[4];                  /* The four 64-bit fields, in stored order */
+  int i;
+  aField[0] = pSeg->iFirst;  aField[1] = pSeg->iLastPg;
+  aField[2] = pSeg->iRoot;   aField[3] = pSeg->nSize;
+  for(i=0; i<4; i++) ckptAppend64(p, piOut, aField[i], pRc);
+}
+
+/* Append the level record for pLevel to checkpoint buffer p at *piOut. */
+static void ckptExportLevel(
+  Level *pLevel,                  /* Level object to serialize */
+  CkptBuffer *p,                  /* Append new level record to this ckpt */
+  int *piOut,                     /* IN/OUT: Size of checkpoint so far */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int iOut = *piOut;
+  Merge *pMerge = pLevel->pMerge;
+
+  /* First word: age in the low 16 bits, flags in the high 16 bits. */
+  ckptSetValue(p, iOut++, (u32)pLevel->iAge + (u32)(pLevel->flags<<16), pRc);
+  ckptSetValue(p, iOut++, pLevel->nRight, pRc);
+  ckptExportSegment(&pLevel->lhs, p, &iOut, pRc);
+
+  assert( (pLevel->nRight>0)==(pMerge!=0) );
+  if( pMerge ){
+    int i;
+    for(i=0; i<pLevel->nRight; i++){
+      ckptExportSegment(&pLevel->aRhs[i], p, &iOut, pRc);
+    }
+    assert( pMerge->nInput==pLevel->nRight
+         || pMerge->nInput==pLevel->nRight+1
+    );
+    ckptSetValue(p, iOut++, pMerge->nInput, pRc);
+    ckptSetValue(p, iOut++, pMerge->nSkip, pRc);
+    for(i=0; i<pMerge->nInput; i++){
+      ckptAppend64(p, &iOut, pMerge->aInput[i].iPg, pRc);
+      ckptSetValue(p, iOut++, pMerge->aInput[i].iCell, pRc);
+    }
+    ckptAppend64(p, &iOut, pMerge->splitkey.iPg, pRc);
+    ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc);
+    ckptAppend64(p, &iOut, pMerge->iCurrentPtr, pRc);
+  }
+  *piOut = iOut;
+}
+
+/*
+** Write the 4 log-pointer values (offset MSW, LSW, two checksums) at *piOut.
+*/
+static void ckptExportLog(
+  lsm_db *pDb,
+  int bFlush,
+  CkptBuffer *p,
+  int *piOut,
+  int *pRc
+){
+  int iIdx = *piOut;
+  assert( iIdx==CKPT_HDR_LO_MSW );
+
+  if( !bFlush ){
+    /* Carry the log pointer over unchanged from ShmHeader.aSnap2[]. */
+    while( iIdx<=CKPT_HDR_LO_CKSUM2 ){
+      ckptSetValue(p, iIdx, pDb->pShmhdr->aSnap2[iIdx], pRc);
+      iIdx++;
+    }
+  }else{
+    /* Use the "old" log offset and checksums held in the tree header. */
+    ckptAppend64(p, &iIdx, pDb->treehdr.iOldLog, pRc);
+    ckptSetValue(p, iIdx++, pDb->treehdr.oldcksum0, pRc);
+    ckptSetValue(p, iIdx++, pDb->treehdr.oldcksum1, pRc);
+  }
+  assert( *pRc || iIdx==CKPT_HDR_LO_CKSUM2+1 );
+  *piOut = iIdx;
+}
+
+static void ckptExportAppendlist(
+  lsm_db *db,                     /* Database connection */
+  CkptBuffer *p,                  /* Checkpoint buffer to write to */
+  int *piOut,                     /* IN/OUT: Offset within checkpoint buffer */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  int i;
+  Pgno *aiAppend = db->pWorker->aiAppend;
+
+  for(i=0; i<LSM_APPLIST_SZ; i++){
+    ckptAppend64(p, piOut, aiAppend[i], pRc);
+  }
+}
+
+/* Serialize the worker snapshot (pDb->pWorker) into a new checkpoint blob. */
+static int ckptExportSnapshot(
+  lsm_db *pDb,                    /* Connection handle */
+  int bLog,                       /* Passed to ckptExportLog() as bFlush */
+  i64 iId,                        /* Checkpoint id */
+  int bCksum,                     /* If true, compute the two checksums */
+  void **ppCkpt,                  /* OUT: Buffer containing checkpoint */
+  int *pnCkpt                     /* OUT: Size of checkpoint in bytes */
+){
+  int rc = LSM_OK;                /* Return Code */
+  FileSystem *pFS = pDb->pFS;     /* File system object */
+  Snapshot *pSnap = pDb->pWorker; /* Worker snapshot */
+  int nLevel = 0;                 /* Number of levels in checkpoint */
+  int iLevel;                     /* Used to count out nLevel levels */
+  int iOut = 0;                   /* Current offset in aCkpt[] */
+  Level *pLevel;                  /* Level iterator */
+  int i;                          /* Iterator used while serializing freelist */
+  CkptBuffer ckpt;
+
+  /* Initialize the output buffer */
+  memset(&ckpt, 0, sizeof(CkptBuffer));
+  ckpt.pEnv = pDb->pEnv;
+  iOut = CKPT_HDR_SIZE;
+
+  /* Write the log offset into the checkpoint. */
+  ckptExportLog(pDb, bLog, &ckpt, &iOut, &rc);
+
+  /* Write the append-point list */
+  ckptExportAppendlist(pDb, &ckpt, &iOut, &rc);
+
+  /* Figure out how many levels will be written to the checkpoint. */
+  for(pLevel=lsmDbSnapshotLevel(pSnap); pLevel; pLevel=pLevel->pNext) nLevel++;
+
+  /* Serialize nLevel levels. */
+  iLevel = 0;
+  for(pLevel=lsmDbSnapshotLevel(pSnap); iLevel<nLevel; pLevel=pLevel->pNext){
+    ckptExportLevel(pLevel, &ckpt, &iOut, &rc);
+    iLevel++;
+  }
+
+  /* Write the block-redirect list */
+  ckptSetValue(&ckpt, iOut++, pSnap->redirect.n, &rc);
+  for(i=0; i<pSnap->redirect.n; i++){
+    ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iFrom, &rc);
+    ckptSetValue(&ckpt, iOut++, pSnap->redirect.a[i].iTo, &rc);
+  }
+
+  /* Write the freelist */
+  assert( pSnap->freelist.nEntry<=pDb->nMaxFreelist );
+  if( rc==LSM_OK ){
+    int nFree = pSnap->freelist.nEntry;
+    ckptSetValue(&ckpt, iOut++, nFree, &rc);
+    for(i=0; i<nFree; i++){
+      FreelistEntry *p = &pSnap->freelist.aEntry[i];
+      ckptSetValue(&ckpt, iOut++, p->iBlk, &rc);
+      ckptSetValue(&ckpt, iOut++, (p->iId >> 32) & 0xFFFFFFFF, &rc);
+      ckptSetValue(&ckpt, iOut++, p->iId & 0xFFFFFFFF, &rc);
+    }
+  }
+
+  /* Write the checkpoint header */
+  assert( iId>=0 );
+  assert( pSnap->iCmpId==pDb->compress.iId
+       || pSnap->iCmpId==LSM_COMPRESSION_EMPTY
+  );
+  ckptSetValue(&ckpt, CKPT_HDR_ID_MSW, (u32)(iId>>32), &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_ID_LSW, (u32)(iId&0xFFFFFFFF), &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_NCKPT, iOut+2, &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_CMPID, pDb->compress.iId, &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_NBLOCK, pSnap->nBlock, &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_BLKSZ, lsmFsBlockSize(pFS), &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_NLEVEL, nLevel, &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_PGSZ, lsmFsPageSize(pFS), &rc);
+  ckptSetValue(&ckpt, CKPT_HDR_NWRITE, pSnap->nWrite, &rc);
+
+  if( bCksum ){
+    ckptAddChecksum(&ckpt, iOut, &rc);
+  }else{
+    ckptSetValue(&ckpt, iOut, 0, &rc);
+    ckptSetValue(&ckpt, iOut+1, 0, &rc);
+  }
+  iOut += 2;
+  assert( iOut<=1024 );
+#ifdef LSM_LOG_FREELIST
+  lsmLogMessage(pDb, rc,
+      "ckptExportSnapshot(): id=%lld freelist: %d", iId, pSnap->freelist.nEntry
+  );
+  for(i=0; i<pSnap->freelist.nEntry; i++){
+    lsmLogMessage(pDb, rc,
+        "ckptExportSnapshot(): iBlk=%d id=%lld",
+        pSnap->freelist.aEntry[i].iBlk,
+        pSnap->freelist.aEntry[i].iId
+    );
+  }
+#endif
+
+  *ppCkpt = (void *)ckpt.aCkpt;
+  if( pnCkpt ) *pnCkpt = sizeof(u32)*iOut;
+  return rc;
+}
+
+
+/* Decode the segment record at aIn[*piIn] into *pSegment; advance *piIn. */
+static void ckptNewSegment(
+  u32 *aIn,
+  int *piIn,
+  Segment *pSegment               /* Populate this structure */
+){
+  u32 *aRec = &aIn[*piIn];        /* Four 64-bit fields = 8 integers */
+  assert( pSegment->iFirst==0 && pSegment->iLastPg==0 );
+  assert( pSegment->nSize==0 && pSegment->iRoot==0 );
+  pSegment->iFirst = ckptRead64(&aRec[0]);
+  pSegment->iLastPg = ckptRead64(&aRec[2]);
+  pSegment->iRoot = ckptRead64(&aRec[4]);
+  pSegment->nSize = (int)ckptRead64(&aRec[6]);
+  *piIn += 8;
+  assert( pSegment->iFirst );
+}
+
+/* Allocate pLevel->pMerge and populate it from the record at aInt[*piIn]. */
+static int ckptSetupMerge(lsm_db *pDb, u32 *aInt, int *piIn, Level *pLevel){
+  Merge *pMerge;                  /* Allocated Merge object */
+  int nInput;                     /* Number of input segments in merge */
+  int iIn = *piIn;                /* Next value to read from aInt[] */
+  int i;                          /* Iterator variable */
+  int nByte;                      /* Number of bytes to allocate */
+
+  /* Allocate the Merge object. If malloc() fails, return LSM_NOMEM. */
+  nInput = (int)aInt[iIn++];
+  nByte = sizeof(Merge) + sizeof(MergeInput) * nInput;
+  pMerge = (Merge *)lsmMallocZero(pDb->pEnv, nByte);
+  if( !pMerge ) return LSM_NOMEM_BKPT;
+  pLevel->pMerge = pMerge;
+
+  /* Populate the Merge object. */
+  pMerge->aInput = (MergeInput *)&pMerge[1];
+  pMerge->nInput = nInput;
+  pMerge->iOutputOff = -1;
+  pMerge->nSkip = (int)aInt[iIn++];
+  for(i=0; i<nInput; i++){
+    pMerge->aInput[i].iPg = ckptGobble64(aInt, &iIn);
+    pMerge->aInput[i].iCell = (int)aInt[iIn++];
+  }
+  pMerge->splitkey.iPg = ckptGobble64(aInt, &iIn);
+  pMerge->splitkey.iCell = (int)aInt[iIn++];
+  pMerge->iCurrentPtr = ckptGobble64(aInt, &iIn);
+  /* Set *piIn and return LSM_OK. */
+  *piIn = iIn;
+  return LSM_OK;
+}
+
+
+/* Decode nLevel level records from aIn[*piIn] into a new list at *ppLevel. */
+static int ckptLoadLevels(
+  lsm_db *pDb,
+  u32 *aIn,
+  int *piIn,
+  int nLevel,
+  Level **ppLevel
+){
+  int i;
+  int rc = LSM_OK;
+  Level *pRet = 0;
+  Level **ppNext = &pRet;
+  int iIn = *piIn;
+
+  for(i=0; rc==LSM_OK && i<nLevel; i++){
+    int iRight;
+    Level *pLevel;
+
+    /* Allocate space for the Level structure */
+    pLevel = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc);
+    if( rc==LSM_OK ){
+      pLevel->iAge = (u16)(aIn[iIn] & 0x0000FFFF);
+      pLevel->flags = (u16)((aIn[iIn]>>16) & 0x0000FFFF);
+      iIn++;
+      pLevel->nRight = aIn[iIn++];
+      if( pLevel->nRight ){
+        int nByte = sizeof(Segment) * pLevel->nRight;
+        pLevel->aRhs = (Segment *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
+      }
+      if( rc==LSM_OK ){
+        *ppNext = pLevel;
+        ppNext = &pLevel->pNext;
+
+        /* Allocate the main segment */
+        ckptNewSegment(aIn, &iIn, &pLevel->lhs);
+
+        /* Allocate each of the right-hand segments, if any */
+        for(iRight=0; iRight<pLevel->nRight; iRight++){
+          ckptNewSegment(aIn, &iIn, &pLevel->aRhs[iRight]);
+        }
+
+        /* Set up the Merge object, if required */
+        if( pLevel->nRight>0 ){
+          rc = ckptSetupMerge(pDb, aIn, &iIn, pLevel);
+        }
+      }else{
+        lsmFree(pDb->pEnv, pLevel);   /* Not linked into pRet; free it here */
+      }
+    }
+  }
+
+  if( rc!=LSM_OK ){
+    /* An OOM must have occurred. Free the levels already linked into pRet. */
+    lsmSortedFreeLevel(pDb->pEnv, pRet);
+    pRet = 0;
+  }
+  *ppLevel = pRet;
+  *piIn = iIn;
+  return rc;
+}
+
+
+/*
+** Decode LEVELS blob pVal/nVal; append its levels to the worker snapshot.
+*/
+int lsmCheckpointLoadLevels(lsm_db *pDb, void *pVal, int nVal){
+  int rc = LSM_OK;                /* Return code */
+  u32 *aIn;                       /* Native byte-order copy of pVal */
+  Level *pNew = 0;                /* Levels decoded from the blob */
+  Level *pTail;                   /* Last level of the worker snapshot */
+  int iIn = 1;                    /* aIn[0] holds the level count */
+
+  /* An empty blob contributes no levels. */
+  if( nVal<=0 ) return rc;
+
+  aIn = lsmMallocRc(pDb->pEnv, nVal, &rc);
+  if( aIn==0 ) return rc;
+
+  /* Byte-swap a private copy so that the caller's buffer is left alone. */
+  memcpy(aIn, pVal, nVal);
+  ckptChangeEndianness(aIn, (int)(nVal / sizeof(u32)));
+  rc = ckptLoadLevels(pDb, aIn, &iIn, (int)aIn[0], &pNew);
+  lsmFree(pDb->pEnv, aIn);
+  assert( rc==LSM_OK || pNew==0 );
+
+  /* Append the decoded levels to the end of the worker's level list. */
+  if( rc==LSM_OK ){
+    pTail = lsmDbSnapshotLevel(pDb->pWorker);
+    assert( pTail );
+    for( ; pTail->pNext; pTail=pTail->pNext);
+    pTail->pNext = pNew;
+  }
+  return rc;
+}
+
+/*
+** Return the data for the LEVELS record.
+**
+** The size of the checkpoint that can be stored in the database header
+** must not exceed 1024 32-bit integers. Normally, it does not. However,
+** if it does, part of the checkpoint must be stored in the LSM. This
+** routine returns that part.
+*/
+int lsmCheckpointLevels(
+  lsm_db *pDb,                    /* Database handle */
+  int nLevel,                     /* Number of levels to write to blob */
+  void **paVal,                   /* OUT: Pointer to LEVELS blob */
+  int *pnVal                      /* OUT: Size of LEVELS blob in bytes */
+){
+  Level *p;                       /* Used to iterate through levels */
+  int nAll= 0;                    /* Levels in the worker snapshot */
+  int rc = LSM_OK;                /* Was uninitialized, yet read by the loop */
+  int i;
+  int iOut;                       /* Next offset to write in ckpt.aCkpt[] */
+  CkptBuffer ckpt;
+  assert( nLevel>0 );
+
+  for(p=lsmDbSnapshotLevel(pDb->pWorker); p; p=p->pNext) nAll++;
+
+  assert( nAll>nLevel );
+  nAll -= nLevel;                 /* Skip all but the final nLevel levels */
+  for(p=lsmDbSnapshotLevel(pDb->pWorker); p && nAll>0; p=p->pNext) nAll--;
+
+  memset(&ckpt, 0, sizeof(CkptBuffer));
+  ckpt.pEnv = pDb->pEnv;
+
+  ckptSetValue(&ckpt, 0, nLevel, &rc);
+  iOut = 1;
+  for(i=0; rc==LSM_OK && i<nLevel; i++){
+    ckptExportLevel(p, &ckpt, &iOut, &rc);
+    p = p->pNext;
+  }
+  assert( rc!=LSM_OK || p==0 );
+
+  if( rc==LSM_OK ){
+    ckptChangeEndianness(ckpt.aCkpt, iOut);
+    *paVal = (void *)ckpt.aCkpt;
+    *pnVal = iOut * sizeof(u32);
+  }else{
+    *pnVal = 0;
+    *paVal = 0;
+  }
+
+  return rc;
+}
+
+/*
+** Return the 64-bit checkpoint id stored on meta-page pPg, or 0 if pPg==0.
+*/
+static i64 ckptLoadId(MetaPage *pPg){
+  int nData;                      /* Set by lsmFsMetaPageData(); not used */
+  u8 *aData;                      /* Raw meta-page content */
+  i64 iMsw, iLsw;                 /* The two halves of the id */
+  if( pPg==0 ) return 0;
+  aData = lsmFsMetaPageData(pPg, &nData);
+  iMsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_MSW*4]);
+  iLsw = (i64)lsmGetU32(&aData[CKPT_HDR_ID_LSW*4]);
+  return (iMsw << 32) + iLsw;
+}
+
+/*
+** Return true if the buffer passed as an argument contains a valid
+** checkpoint.
+*/
+static int ckptChecksumOk(u32 *aCkpt){
+  u32 nCkpt = aCkpt[CKPT_HDR_NCKPT];   /* Claimed size, in integers */
+  u32 cksum1;
+  u32 cksum2;
+  /* NOTE(review): lower bound restored after extraction damage; verify. */
+  if( nCkpt<CKPT_HDR_NCKPT || nCkpt>(LSM_META_RW_PAGE_SIZE)/sizeof(u32) ){
+    return 0;
+  }
+  ckptChecksum(aCkpt, nCkpt, &cksum1, &cksum2);
+  return (cksum1==aCkpt[nCkpt-2] && cksum2==aCkpt[nCkpt-1]);
+}
+
+/*
+** Attempt to load a checkpoint from meta page iMeta.
+**
+** This function is a no-op if *pRc is set to any value other than LSM_OK
+** when it is called. If an error occurs, *pRc is set to an LSM error code
+** before returning.
+**
+** If no error occurs and the checkpoint is successfully loaded, copy it to
+** ShmHeader.aSnap1[], ShmHeader.aSnap2[] and pDb->aSnapshot[], and set
+** ShmHeader.iMetaPage to iMeta. In this case return 1. Or, if the checkpoint
+** cannot be loaded (bad size field or checksum mismatch), return 0.
+*/
+static int ckptTryLoad(lsm_db *pDb, MetaPage *pPg, u32 iMeta, int *pRc){
+ int bLoaded = 0; /* Return value */
+ if( *pRc==LSM_OK ){
+ int rc = LSM_OK; /* Error code */
+ u32 *aCkpt = 0; /* Pointer to buffer containing checkpoint */
+ u32 nCkpt; /* Number of elements in aCkpt[] */
+ int nData; /* Bytes of data in aData[] */
+ u8 *aData; /* Meta page data */
+
+ aData = lsmFsMetaPageData(pPg, &nData);
+ nCkpt = (u32)lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
+ if( nCkpt<=nData/sizeof(u32) && nCkpt>CKPT_HDR_NCKPT ){ /* Size plausible? */
+ aCkpt = (u32 *)lsmMallocRc(pDb->pEnv, nCkpt*sizeof(u32), &rc);
+ }
+ if( aCkpt ){
+ memcpy(aCkpt, aData, nCkpt*sizeof(u32));
+ ckptChangeEndianness(aCkpt, nCkpt); /* Swap the private copy only */
+ if( ckptChecksumOk(aCkpt) ){
+ ShmHeader *pShm = pDb->pShmhdr;
+ memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32));
+ memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32));
+ memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
+ pShm->iMetaPage = iMeta;
+ bLoaded = 1;
+ }
+ }
+
+ lsmFree(pDb->pEnv, aCkpt); /* aCkpt is still 0 if nothing was allocated */
+ *pRc = rc;
+ }
+ return bLoaded;
+}
+
+/*
+** Initialize the shared-memory header and pDb->aSnapshot[] with an empty
+** snapshot. Called when no valid snapshot is found in the database header.
+*/
+static void ckptLoadEmpty(lsm_db *pDb){
+ u32 aCkpt[] = {
+ 0, /* CKPT_HDR_ID_MSW */
+ 10, /* CKPT_HDR_ID_LSW */
+ 0, /* CKPT_HDR_NCKPT (set below) */
+ LSM_COMPRESSION_EMPTY, /* CKPT_HDR_CMPID */
+ 0, /* CKPT_HDR_NBLOCK */
+ 0, /* CKPT_HDR_BLKSZ (set below) */
+ 0, /* CKPT_HDR_NLEVEL */
+ 0, /* CKPT_HDR_PGSZ (set below) */
+ 0, /* CKPT_HDR_NWRITE */
+ 0, 0, 1234, 5678, /* The log pointer and initial checksum */
+ 0,0,0,0, 0,0,0,0, /* The append list */
+ 0, /* The redirected block list */
+ 0, /* The free block list */
+ 0, 0 /* Space for checksum values */
+ };
+ u32 nCkpt = array_size(aCkpt);
+ ShmHeader *pShm = pDb->pShmhdr;
+ /* Fill in the fields not known statically, then compute the checksum. */
+ aCkpt[CKPT_HDR_NCKPT] = nCkpt;
+ aCkpt[CKPT_HDR_BLKSZ] = pDb->nDfltBlksz;
+ aCkpt[CKPT_HDR_PGSZ] = pDb->nDfltPgsz;
+ ckptChecksum(aCkpt, array_size(aCkpt), &aCkpt[nCkpt-2], &aCkpt[nCkpt-1]);
+ /* Install the empty checkpoint in all three snapshot copies. */
+ memcpy(pShm->aSnap1, aCkpt, nCkpt*sizeof(u32));
+ memcpy(pShm->aSnap2, aCkpt, nCkpt*sizeof(u32));
+ memcpy(pDb->aSnapshot, aCkpt, nCkpt*sizeof(u32));
+}
+
+/*
+** This function is called as part of database recovery to initialize the
+** ShmHeader.aSnap1[] and ShmHeader.aSnap2[] snapshots.
+*/
+int lsmCheckpointRecover(lsm_db *pDb){
+ int rc = LSM_OK; /* Return Code */
+ i64 iId1; /* Id of checkpoint on meta-page 1 */
+ i64 iId2; /* Id of checkpoint on meta-page 2 */
+ int bLoaded = 0; /* True once checkpoint has been loaded */
+ int cmp; /* True if (iId2>iId1) */
+ MetaPage *apPg[2] = {0, 0}; /* Meta-pages 1 and 2 */
+
+ rc = lsmFsMetaPageGet(pDb->pFS, 0, 1, &apPg[0]);
+ if( rc==LSM_OK ) rc = lsmFsMetaPageGet(pDb->pFS, 0, 2, &apPg[1]);
+ /* Try the page with the larger id first (page 1 on a tie), then the other. */
+ iId1 = ckptLoadId(apPg[0]);
+ iId2 = ckptLoadId(apPg[1]);
+ cmp = (iId2 > iId1);
+ bLoaded = ckptTryLoad(pDb, apPg[cmp?1:0], (cmp?2:1), &rc);
+ if( bLoaded==0 ){
+ bLoaded = ckptTryLoad(pDb, apPg[cmp?0:1], (cmp?1:2), &rc);
+ }
+
+ /* The database does not contain a valid checkpoint. Initialize the shared
+ ** memory header with an empty checkpoint. */
+ if( bLoaded==0 ){
+ ckptLoadEmpty(pDb);
+ }
+
+ lsmFsMetaPageRelease(apPg[0]);
+ lsmFsMetaPageRelease(apPg[1]);
+
+ return rc;
+}
+
+/*
+** Store the snapshot in pDb->aSnapshot[] in meta-page iMeta.
+*/
+int lsmCheckpointStore(lsm_db *pDb, int iMeta){
+  MetaPage *pPg = 0;              /* Handle on meta-page iMeta */
+  u8 *aData;                      /* Content buffer of pPg */
+  int nData;                      /* Size of aData[] (not otherwise used) */
+  int nCkpt;                      /* Size of the snapshot in 32-bit integers */
+  int rc;
+
+  assert( iMeta==1 || iMeta==2 );
+  rc = lsmFsMetaPageGet(pDb->pFS, 1, iMeta, &pPg);
+  if( rc!=LSM_OK ) return rc;
+
+  /* Copy the snapshot onto the page, then byte-swap the page copy (not
+  ** pDb->aSnapshot[] itself) via ckptChangeEndianness(). */
+  nCkpt = (int)pDb->aSnapshot[CKPT_HDR_NCKPT];
+  aData = lsmFsMetaPageData(pPg, &nData);
+  memcpy(aData, pDb->aSnapshot, nCkpt*sizeof(u32));
+  ckptChangeEndianness((u32 *)aData, nCkpt);
+
+  return lsmFsMetaPageRelease(pPg);
+}
+
+/*
+** Copy the current client snapshot from shared-memory to pDb->aSnapshot[].
+*/
+int lsmCheckpointLoad(lsm_db *pDb, int *piRead){
+ int nRem = LSM_ATTEMPTS_BEFORE_PROTOCOL; /* Attempts remaining */
+ ShmHeader *pShm = pDb->pShmhdr;
+ while( (nRem--)>0 ){
+ int nInt;
+ /* First choice: aSnap1. Accept it only if its checksum verifies. */
+ nInt = pShm->aSnap1[CKPT_HDR_NCKPT];
+ if( nInt<=(LSM_META_RW_PAGE_SIZE / sizeof(u32)) ){
+ memcpy(pDb->aSnapshot, pShm->aSnap1, nInt*sizeof(u32));
+ if( ckptChecksumOk(pDb->aSnapshot) ){
+ if( piRead ) *piRead = 1; /* Report which copy was used */
+ return LSM_OK;
+ }
+ }
+ /* Second choice: aSnap2, checked the same way. */
+ nInt = pShm->aSnap2[CKPT_HDR_NCKPT];
+ if( nInt<=(LSM_META_RW_PAGE_SIZE / sizeof(u32)) ){
+ memcpy(pDb->aSnapshot, pShm->aSnap2, nInt*sizeof(u32));
+ if( ckptChecksumOk(pDb->aSnapshot) ){
+ if( piRead ) *piRead = 2;
+ return LSM_OK;
+ }
+ }
+ /* Neither copy verified. Call lsmShmBarrier() and try again. */
+ lsmShmBarrier(pDb);
+ }
+ return LSM_PROTOCOL_BKPT;
+}
+
+/* Load the current snapshot and set *piCmpId to its compression id. */
+int lsmInfoCompressionId(lsm_db *db, u32 *piCmpId){
+  int rc;
+  assert( db->pClient==0 && db->pWorker==0 );
+
+  rc = lsmCheckpointLoad(db, 0);
+  if( rc!=LSM_OK ) return rc;
+
+  *piCmpId = db->aSnapshot[CKPT_HDR_CMPID];
+  return LSM_OK;
+}
+
+int lsmCheckpointLoadOk(lsm_db *pDb, int iSnap){  /* 1 if the ids match */
+  u32 *aShm = pDb->pShmhdr->aSnap2;       /* Shared copy to compare against */
+  assert( iSnap==1 || iSnap==2 );
+  if( iSnap==1 ) aShm = pDb->pShmhdr->aSnap1;
+  return lsmCheckpointId(aShm, 0)==lsmCheckpointId(pDb->aSnapshot, 0);
+}
+
+int lsmCheckpointClientCacheOk(lsm_db *pDb){
+  /* True only if a client snapshot exists and all three copies carry its id. */
+  if( pDb->pClient==0 ) return 0;
+  if( pDb->pClient->iId!=lsmCheckpointId(pDb->aSnapshot, 0) ) return 0;
+  if( pDb->pClient->iId!=lsmCheckpointId(pDb->pShmhdr->aSnap1, 0) ) return 0;
+  return pDb->pClient->iId==lsmCheckpointId(pDb->pShmhdr->aSnap2, 0);
+}
+
+int lsmCheckpointLoadWorker(lsm_db *pDb){
+ int rc;
+ ShmHeader *pShm = pDb->pShmhdr;
+ int nInt1;
+ int nInt2;
+ /* Must be holding the WORKER lock to do this, or DMS1 (see the assert). */
+ /* NOTE(review): this comment used to say DMS2 - confirm which is meant. */
+ assert(
+ lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL)
+ || lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL)
+ );
+
+ /* Check that the two snapshots match. If not, repair them. */
+ nInt1 = pShm->aSnap1[CKPT_HDR_NCKPT];
+ nInt2 = pShm->aSnap2[CKPT_HDR_NCKPT];
+ if( nInt1!=nInt2 || memcmp(pShm->aSnap1, pShm->aSnap2, nInt2*sizeof(u32)) ){
+ if( ckptChecksumOk(pShm->aSnap1) ){
+ memcpy(pShm->aSnap2, pShm->aSnap1, sizeof(u32)*nInt1);
+ }else if( ckptChecksumOk(pShm->aSnap2) ){
+ memcpy(pShm->aSnap1, pShm->aSnap2, sizeof(u32)*nInt2);
+ }else{
+ return LSM_PROTOCOL_BKPT; /* Neither copy has a valid checksum */
+ }
+ }
+ /* Build the worker snapshot object from aSnap1, including the free-list. */
+ rc = lsmCheckpointDeserialize(pDb, 1, pShm->aSnap1, &pDb->pWorker);
+ if( pDb->pWorker ) pDb->pWorker->pDatabase = pDb->pDatabase;
+
+ if( rc==LSM_OK ){
+ rc = lsmCheckCompressionId(pDb, pDb->pWorker->iCmpId);
+ }
+
+#if 0
+ assert( rc!=LSM_OK || lsmFsIntegrityCheck(pDb) );
+#endif
+ return rc;
+}
+
+int lsmCheckpointDeserialize(
+  lsm_db *pDb,                    /* Database handle (supplies allocator env) */
+  int bInclFreelist,              /* If true, deserialize free-list */
+  u32 *aCkpt,                     /* Checkpoint in machine byte order */
+  Snapshot **ppSnap               /* OUT: New snapshot, or 0 on error */
+){
+  int rc = LSM_OK;
+  Snapshot *pNew;
+
+  pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc);
+  if( rc==LSM_OK ){
+    Level *pLvl;
+    int nFree;
+    int i;
+    int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL];
+    int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE;
+
+    pNew->iId = lsmCheckpointId(aCkpt, 0);
+    pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK];
+    pNew->nWrite = aCkpt[CKPT_HDR_NWRITE];
+    rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel);
+    pNew->iLogOff = lsmCheckpointLogOffset(aCkpt);
+    pNew->iCmpId = aCkpt[CKPT_HDR_CMPID];
+
+    /* Make a copy of the append-list */
+    for(i=0; i<LSM_APPLIST_SZ; i++){
+      u32 *a = &aCkpt[CKPT_HDR_SIZE + CKPT_LOGPTR_SIZE + i*2];
+      pNew->aiAppend[i] = ckptRead64(a);
+    }
+
+    /* Read the block-redirect list (skipped if the levels failed to load) */
+    pNew->redirect.n = aCkpt[iIn++];
+    if( rc==LSM_OK && pNew->redirect.n ){
+      pNew->redirect.a = lsmMallocZeroRc(pDb->pEnv,
+          (sizeof(struct RedirectEntry) * LSM_MAX_BLOCK_REDIRECTS), &rc
+      );
+      if( rc==LSM_OK ){
+        for(i=0; i<pNew->redirect.n; i++){
+          pNew->redirect.a[i].iFrom = aCkpt[iIn++];
+          pNew->redirect.a[i].iTo = aCkpt[iIn++];
+        }
+        for(pLvl=pNew->pLevel; pLvl && pLvl->pNext; pLvl=pLvl->pNext);
+        if( pLvl && pLvl->nRight ){
+          pLvl->aRhs[pLvl->nRight-1].pRedirect = &pNew->redirect;
+        }else if( pLvl ){
+          pLvl->lhs.pRedirect = &pNew->redirect;
+        }
+      }
+    }
+
+    /* Copy the free-list */
+    if( rc==LSM_OK && bInclFreelist ){
+      nFree = aCkpt[iIn++];
+      if( nFree ){
+        pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc(
+            pDb->pEnv, sizeof(FreelistEntry)*nFree, &rc
+        );
+        if( rc==LSM_OK ){
+          int j;
+          for(j=0; j<nFree; j++){
+            FreelistEntry *p = &pNew->freelist.aEntry[j];
+            p->iBlk = aCkpt[iIn++];
+            p->iId = ((i64)(aCkpt[iIn])<<32) + aCkpt[iIn+1];
+            iIn += 2;
+          }
+          pNew->freelist.nEntry = pNew->freelist.nAlloc = nFree;
+        }
+      }
+    }
+  }
+
+  if( rc!=LSM_OK ){
+    lsmFreeSnapshot(pDb->pEnv, pNew);
+    pNew = 0;
+  }
+
+  *ppSnap = pNew;
+  return rc;
+}
+
+/*
+** Connection pDb must be the worker connection in order to call this
+** function. It returns true if the database already contains the maximum
+** number of levels or false otherwise.
+**
+** This is used when flushing the in-memory tree to disk. If the database
+** is already full, then the caller should invoke lsm_work() or similar
+** until it is not full before creating a new level by flushing the in-memory
+** tree to disk. Limiting the number of levels in the database ensures that
+** the records describing them always fit within the checkpoint blob.
+*/
+int lsmDatabaseFull(lsm_db *pDb){
+  Level *pLvl;                    /* Iterator over the worker's levels */
+  int nSeg = 0;                   /* Running total compared to the limit */
+
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL) );
+  assert( pDb->pWorker );
+  pLvl = pDb->pWorker->pLevel;
+  while( pLvl ){
+    if( pLvl->nRight ) nSeg += pLvl->nRight; else nSeg += 1;
+    pLvl = pLvl->pNext;
+  }
+  return (LSM_MAX_RHS_SEGMENTS <= nSeg);
+}
+
+/*
+** The connection passed as the only argument is currently the worker
+** connection. Some work has been performed on the database by the connection,
+** but no new snapshot has been written into shared memory.
+**
+** This function updates the shared-memory worker and client snapshots with
+** the new snapshot produced by the work performed by pDb.
+**
+** If successful, LSM_OK is returned. Otherwise, if an error occurs, an LSM
+** error code is returned.
+*/
+int lsmCheckpointSaveWorker(lsm_db *pDb, int bFlush){
+ Snapshot *pSnap = pDb->pWorker;
+ ShmHeader *pShm = pDb->pShmhdr;
+ void *p = 0; /* Serialized checkpoint, freed below */
+ int n = 0; /* Size of p in bytes */
+ int rc;
+ /* Each saved snapshot gets the next id; checksums are always included. */
+ pSnap->iId++;
+ rc = ckptExportSnapshot(pDb, bFlush, pSnap->iId, 1, &p, &n);
+ if( rc!=LSM_OK ) return rc;
+ assert( ckptChecksumOk((u32 *)p) );
+ /* Write aSnap2 first, then aSnap1, with lsmShmBarrier() between them. */
+ assert( n<=LSM_META_RW_PAGE_SIZE );
+ memcpy(pShm->aSnap2, p, n);
+ lsmShmBarrier(pDb);
+ memcpy(pShm->aSnap1, p, n);
+ lsmFree(pDb->pEnv, p);
+
+ /* assert( lsmFsIntegrityCheck(pDb) ); */
+ return LSM_OK;
+}
+
+/*
+** This function is used to determine the snapshot-id of the most recently
+** checkpointed snapshot. Variable ShmHeader.iMetaPage indicates which of
+** the two meta-pages said snapshot resides on (if any).
+**
+** If the snapshot on that meta-page is loaded and its checksum verified,
+** *piId, *piLog and *pnWrite (each may be NULL) are set from it and LSM_OK
+** is returned. If there is no such meta-page, the checksum does not match,
+** or ShmHeader.iMetaPage changes during the call, they are all set to zero
+** and LSM_OK is returned. If an error occurs, an LSM error code is returned.
+*/
+int lsmCheckpointSynced(lsm_db *pDb, i64 *piId, i64 *piLog, u32 *pnWrite){
+  int rc = LSM_OK;
+  int bValid = 0;                 /* True once outputs hold verified values */
+  MetaPage *pPg;
+  u32 iMeta = pDb->pShmhdr->iMetaPage;
+
+  if( iMeta==1 || iMeta==2 ){
+    rc = lsmFsMetaPageGet(pDb->pFS, 0, iMeta, &pPg);
+    if( rc==LSM_OK ){
+      int nCkpt, nData;
+      u8 *aData;
+
+      aData = lsmFsMetaPageData(pPg, &nData);
+      assert( nData==LSM_META_RW_PAGE_SIZE );
+      nCkpt = lsmGetU32(&aData[CKPT_HDR_NCKPT*sizeof(u32)]);
+      if( nCkpt>CKPT_HDR_NCKPT && nCkpt<(LSM_META_RW_PAGE_SIZE/sizeof(u32)) ){
+        u32 *aCopy = lsmMallocRc(pDb->pEnv, sizeof(u32) * nCkpt, &rc);
+        if( aCopy ){
+          memcpy(aCopy, aData, nCkpt*sizeof(u32));
+          ckptChangeEndianness(aCopy, nCkpt);
+          if( ckptChecksumOk(aCopy) ){
+            if( piId ) *piId = lsmCheckpointId(aCopy, 0);
+            if( piLog ) *piLog = (lsmCheckpointLogOffset(aCopy) >> 1);
+            if( pnWrite ) *pnWrite = aCopy[CKPT_HDR_NWRITE];
+            bValid = 1;
+          }
+          lsmFree(pDb->pEnv, aCopy);
+        }
+      }
+      lsmFsMetaPageRelease(pPg);
+    }
+  }
+
+  if( bValid==0 || rc!=LSM_OK || pDb->pShmhdr->iMetaPage!=iMeta ){
+    if( piId ) *piId = 0;
+    if( piLog ) *piLog = 0;
+    if( pnWrite ) *pnWrite = 0;
+  }
+  return rc;
+}
+
+/*
+** Return the checkpoint-id of the checkpoint array passed as the first
+** argument to this function. If the second argument is true, then assume
+** that the checkpoint is made up of 32-bit big-endian integers. If it
+** is false, assume that the integers are in machine byte order.
+*/
+i64 lsmCheckpointId(u32 *aCkpt, int bDisk){
+  i64 iMsw, iLsw;                 /* Upper and lower 32 bits of the id */
+  if( !bDisk ){
+    iMsw = (i64)aCkpt[CKPT_HDR_ID_MSW];
+    iLsw = (i64)aCkpt[CKPT_HDR_ID_LSW];
+  }else{
+    iMsw = (i64)lsmGetU32(&((u8 *)aCkpt)[CKPT_HDR_ID_MSW*4]);
+    iLsw = (i64)lsmGetU32(&((u8 *)aCkpt)[CKPT_HDR_ID_LSW*4]);
+  }
+  return (iMsw << 32) + iLsw;
+}
+
+u32 lsmCheckpointNBlock(u32 *aCkpt){    /* The CKPT_HDR_NBLOCK header word */
+  return *(aCkpt + CKPT_HDR_NBLOCK);
+}
+
+u32 lsmCheckpointNWrite(u32 *aCkpt, int bDisk){
+  u32 *pField = &aCkpt[CKPT_HDR_NWRITE];  /* The "pages written" header word */
+
+  /* If bDisk is set the word is decoded with lsmGetU32(); otherwise it is
+  ** returned as stored. */
+  return bDisk ? lsmGetU32((u8 *)pField) : *pField;
+}
+
+i64 lsmCheckpointLogOffset(u32 *aCkpt){ /* Raw 64-bit log-offset field */
+  return ckptRead64(&aCkpt[CKPT_HDR_LO_MSW]);
+}
+
+int lsmCheckpointPgsz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_PGSZ]; } /* Page size field */
+
+int lsmCheckpointBlksz(u32 *aCkpt){ return (int)aCkpt[CKPT_HDR_BLKSZ]; } /* Block size field */
+
+void lsmCheckpointLogoffset(
+  u32 *aCkpt,                     /* Checkpoint in machine byte order */
+  DbLog *pLog                     /* OUT: Log state to populate */
+){
+  i64 iByteOff = lsmCheckpointLogOffset(aCkpt) >> 1;  /* Drop the toggle bit */
+  pLog->iSnapshotId = lsmCheckpointId(aCkpt, 0);
+  pLog->cksum1 = aCkpt[CKPT_HDR_LO_CKSUM2];
+  pLog->cksum0 = aCkpt[CKPT_HDR_LO_CKSUM1];
+  pLog->aRegion[2].iStart = iByteOff;
+}
+
+void lsmCheckpointZeroLogoffset(lsm_db *pDb){
+ u32 nCkpt; /* Size of the snapshot in 32-bit integers */
+
+ nCkpt = pDb->aSnapshot[CKPT_HDR_NCKPT];
+ assert( nCkpt>CKPT_HDR_NCKPT );
+ assert( nCkpt==pDb->pShmhdr->aSnap1[CKPT_HDR_NCKPT] );
+ assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aSnap1, nCkpt*sizeof(u32)) );
+ assert( 0==memcmp(pDb->aSnapshot, pDb->pShmhdr->aSnap2, nCkpt*sizeof(u32)) );
+ /* Zero the log offset in the private copy, then recompute its checksum. */
+ pDb->aSnapshot[CKPT_HDR_LO_MSW] = 0;
+ pDb->aSnapshot[CKPT_HDR_LO_LSW] = 0;
+ ckptChecksum(pDb->aSnapshot, nCkpt,
+ &pDb->aSnapshot[nCkpt-2], &pDb->aSnapshot[nCkpt-1]
+ );
+ /* Copy the modified snapshot to both shared-memory copies. */
+ memcpy(pDb->pShmhdr->aSnap1, pDb->aSnapshot, nCkpt*sizeof(u32));
+ memcpy(pDb->pShmhdr->aSnap2, pDb->aSnapshot, nCkpt*sizeof(u32));
+}
+
+/*
+** Set the output variable to the number of KB of data written into the
+** database file since the most recent checkpoint.
+*/
+int lsmCheckpointSize(lsm_db *db, int *pnKB){
+  int rc = LSM_OK;
+  u32 nSynced = 0;                /* Stays 0 if lsmCheckpointSynced() skips it */
+
+  /* Set nSynced to the number of pages that had been written when the
+  ** database was last checkpointed. */
+  rc = lsmCheckpointSynced(db, 0, 0, &nSynced);
+
+  if( rc==LSM_OK ){
+    u32 nPgsz = db->pShmhdr->aSnap1[CKPT_HDR_PGSZ];
+    u32 nWrite = db->pShmhdr->aSnap1[CKPT_HDR_NWRITE];
+    *pnKB = (int)(( ((i64)(nWrite - nSynced) * nPgsz) + 1023) / 1024);
+  }
+
+  return rc;
+}
diff --git a/ext/lsm1/lsm_file.c b/ext/lsm1/lsm_file.c
new file mode 100644
index 0000000..3283c02
--- /dev/null
+++ b/ext/lsm1/lsm_file.c
@@ -0,0 +1,3307 @@
+/*
+** 2011-08-26
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** NORMAL DATABASE FILE FORMAT
+**
+** The following database file format concepts are used by the code in
+** this file to read and write the database file.
+**
+** Pages:
+**
+** A database file is divided into pages. The first 8KB of the file consists
+** of two 4KB meta-pages. The meta-page size is not configurable. The
+** remainder of the file is made up of database pages. The default database
+** page size is 4KB. Database pages are aligned to page-size boundaries,
+** so if the database page size is larger than 8KB there is a gap between
+** the end of the meta pages and the start of the database pages.
+**
+** Database pages are numbered based on their position in the file. Page N
+** begins at byte offset ((N-1)*pgsz). This means that page 1 does not
+** exist - since it would always overlap with the meta pages. If the
+** page-size is (say) 512 bytes, then the first usable page in the database
+** is page 33.
+**
+** It is assumed that the first two meta pages and the data that follows
+** them are located on different disk sectors. So that if a power failure
+** occurs while writing to a meta page there is no risk of damage to the
+** other meta page or any other part of the database file. TODO: This may
+** need to be revisited.
+**
+** Blocks:
+**
+** The database file is also divided into blocks. The default block size is
+** 1MB. When writing to the database file, an attempt is made to write data
+** in contiguous block-sized chunks.
+**
+** The first and last page on each block are special in that they are 4
+** bytes smaller than all other pages. This is because the last four bytes
+** of space on the first and last pages of each block are reserved for
+** pointers to other blocks (i.e. a 32-bit block number).
+**
+** Runs:
+**
+** A run is a sequence of pages that the upper layer uses to store a
+** sorted array of database keys (and accompanying data - values, FC
+** pointers and so on). Given a page within a run, it is possible to
+** navigate to the next page in the run as follows:
+**
+** a) if the current page is not the last in a block, the next page
+** in the run is located immediately after the current page, OR
+**
+** b) if the current page is the last page in a block, the next page
+** in the run is the first page on the block identified by the
+** block pointer stored in the last 4 bytes of the current block.
+**
+** It is possible to navigate to the previous page in a similar fashion,
+** using the block pointer embedded in the last 4 bytes of the first page
+** of each block as required.
+**
+** The upper layer is responsible for identifying by page number the
+** first and last page of any run that it needs to navigate - there are
+** no "end-of-run" markers stored or identified by this layer. This is
+** necessary as clients reading different database snapshots may access
+** different subsets of a run.
+**
+** THE LOG FILE
+**
+** This file opens and closes the log file. But it does not contain any
+** logic related to the log file format. Instead, it exports the following
+** functions that are used by the code in lsm_log.c to read and write the
+** log file:
+**
+** lsmFsOpenLog
+** lsmFsWriteLog
+** lsmFsSyncLog
+** lsmFsReadLog
+** lsmFsTruncateLog
+** lsmFsCloseAndDeleteLog
+**
+** COMPRESSED DATABASE FILE FORMAT
+**
+** The compressed database file format is very similar to the normal format.
+** The file still begins with two 4KB meta-pages (which are never compressed).
+** It is still divided into blocks.
+**
+** The first and last four bytes of each block are reserved for 32-bit
+** pointer values. Similar to the way four bytes are carved from the end of
+** the first and last page of each block in uncompressed databases. From
+** the point of view of the upper layer, all pages are the same size - this
+** is different from the uncompressed format where the first and last pages
+** on each block are 4 bytes smaller than the others.
+**
+** Pages are stored in variable length compressed form, as follows:
+**
+** * 3-byte size field containing the size of the compressed page image
+** in bytes. The most significant bit of each byte of the size field
+** is always set. The remaining 7 bits are used to store a 21-bit
+** integer value (in big-endian order - the first byte in the field
+** contains the most significant 7 bits). Since the maximum allowed
+** size of a compressed page image is (2^17 - 1) bytes, there are
+** actually 4 unused bits in the size field.
+**
+** In other words, if the size of the compressed page image is nSz,
+** the header can be serialized as follows:
+**
+** u8 aHdr[3]
+** aHdr[0] = 0x80 | (u8)(nSz >> 14);
+** aHdr[1] = 0x80 | (u8)(nSz >> 7);
+** aHdr[2] = 0x80 | (u8)(nSz >> 0);
+**
+** * Compressed page image.
+**
+** * A second copy of the 3-byte record header.
+**
+** A page number is a byte offset into the database file. So the smallest
+** possible page number is 8192 (immediately after the two meta-pages).
+** The first and root page of a segment are identified by a page number
+** corresponding to the byte offset of the first byte in the corresponding
+** page record. The last page of a segment is identified by the byte offset
+** of the last byte in its record.
+**
+** Unlike uncompressed pages, compressed page records may span blocks.
+**
+** Sometimes, in order to avoid touching sectors that contain synced data
+** when writing, it is necessary to insert unused space between compressed
+** page records. This can be done as follows:
+**
+** * For less than 6 bytes of empty space, the first and last byte
+** of the free space contain the total number of free bytes. For
+** example:
+**
+** Block of 4 free bytes: 0x04 0x?? 0x?? 0x04
+** Block of 2 free bytes: 0x02 0x02
+** A single free byte: 0x01
+**
+** * For 6 or more bytes of empty space, a record similar to a
+** compressed page record is added to the segment. A padding record
+** is distinguished from a compressed page record by the most
+** significant bit of the second byte of the size field, which is
+** cleared instead of set.
+*/
+#include "lsmInt.h"
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+/*
+** File-system object. Each database connection allocates a single instance
+** of the following structure. It is used for all access to the database and
+** log files.
+**
+** The database file may be accessed via two methods - using mmap() or using
+** read() and write() calls. In the general case both methods are used - a
+** prefix of the file is mapped into memory and the remainder accessed using
+** read() and write(). This is helpful when accessing very large files (or
+** files that may grow very large during the lifetime of a database
+** connection) on systems with 32-bit address spaces. However, it also requires
+** that this object manage two distinct types of Page objects simultaneously -
+** those that carry pointers to the mapped file and those that carry arrays
+** populated by read() calls.
+**
+** pFree:
+** The head of a singly-linked list containing currently unused Page
+** structures suitable for use as mmap-page handles. Connected by the
+** Page.pFreeNext pointers.
+**
+** pMapped:
+** The head of a singly-linked list that contains all pages that currently
+** carry pointers to the mapped region. This is used if the region is
+** ever remapped - the pointers carried by existing pages can be adjusted
+** to account for the remapping. Connected by the Page.pMappedNext pointers.
+**
+** pWaiting:
+** When the upper layer wishes to append a new b-tree page to a segment,
+** it allocates a Page object that carries a malloc'd block of memory -
+** regardless of the mmap-related configuration. The page is not assigned
+** a page number at first. When the upper layer has finished constructing
+** the page contents, it calls lsmFsPagePersist() to assign a page number
+** to it. At this point it is likely that N pages have been written to the
+** segment, the (N+1)th page is still outstanding and the b-tree page is
+** assigned page number (N+2). To avoid writing page (N+2) before page
+** (N+1), the recently completed b-tree page is held in the singly linked
+** list headed by pWaiting until page (N+1) has been written.
+**
+** Function lsmFsFlushWaiting() is responsible for eventually writing
+** waiting pages to disk.
+**
+** apHash/nHash:
+** Hash table used to store all Page objects that carry malloc'd arrays,
+** except those b-tree pages that have not yet been assigned page numbers.
+** Once they have been assigned page numbers - they are added to this
+** hash table.
+**
+** Hash table overflow chains are connected using the Page.pHashNext
+** pointers.
+**
+** pLruFirst, pLruLast:
+** The first and last entries in a doubly-linked list of pages. This
+** list contains all pages with malloc'd data that are present in the
+** hash table and have a ref-count of zero.
+*/
+struct FileSystem {
+ lsm_db *pDb; /* Database handle that owns this object */
+ lsm_env *pEnv; /* Environment pointer */
+ char *zDb; /* Database file name */
+ char *zLog; /* Log file name (zDb with "-log" appended) */
+ int nMetasize; /* Size of meta pages in bytes */
+ int nMetaRwSize; /* Read/written size of meta pages in bytes */
+ int nPagesize; /* Database page-size in bytes */
+ int nBlocksize; /* Database block-size in bytes */
+
+ /* r/w file descriptors for both files. */
+ LsmFile *pLsmFile; /* Used after lsm_close() to link into list */
+ lsm_file *fdDb; /* Database file */
+ lsm_file *fdLog; /* Log file */
+ int szSector; /* Database file sector size */
+
+ /* If this is a compressed database, a pointer to the compression methods.
+ ** For an uncompressed database, a NULL pointer. */
+ lsm_compress *pCompress;
+ u8 *aIBuffer; /* Buffer to compress to */
+ u8 *aOBuffer; /* Buffer to uncompress from */
+ int nBuffer; /* Allocated size of above buffers in bytes */
+
+ /* mmap() page related things */
+ i64 nMapLimit; /* Maximum bytes of file to map */
+ void *pMap; /* Current mapping of database file */
+ i64 nMap; /* Bytes mapped at pMap */
+ Page *pFree; /* Unused Page structures */
+ Page *pMapped; /* List of Page structs that point to pMap */
+
+ /* Page cache parameters for non-mmap() pages */
+ int nCacheMax; /* Configured cache size (in pages) */
+ int nCacheAlloc; /* Current cache size (in pages) */
+ Page *pLruFirst; /* Head of the LRU list */
+ Page *pLruLast; /* Tail of the LRU list */
+ int nHash; /* Number of hash slots in hash table */
+ Page **apHash; /* nHash Hash slots */
+ Page *pWaiting; /* b-tree pages waiting to be written */
+
+ /* Statistics */
+ int nOut; /* Number of outstanding pages */
+ int nWrite; /* Total number of pages written */
+ int nRead; /* Total number of pages read */
+};
+
+/*
+** Database page handle.
+**
+** pSeg:
+** When lsmFsSortedAppend() is called on a compressed database, the new
+** page is not assigned a page number or location in the database file
+** immediately. Instead, these are assigned by the lsmFsPagePersist() call
+** right before it writes the compressed page image to disk.
+**
+** The lsmFsSortedAppend() function sets the pSeg pointer to point to the
+** segment that the new page will be a part of. It is unset by
+** lsmFsPagePersist() after the page is written to disk.
+*/
+struct Page {
+ u8 *aData; /* Buffer containing page data */
+ int nData; /* Bytes of usable data at aData[] */
+ Pgno iPg; /* Page number */
+ int nRef; /* Number of outstanding references */
+ int flags; /* Combination of PAGE_XXX flags */
+ Page *pHashNext; /* Next page in hash table slot */
+ /* pLruNext/pLruPrev link the page into the doubly-linked list bounded by
+ ** FileSystem.pLruFirst and FileSystem.pLruLast. */
+ Page *pLruNext; /* Next page in LRU list */
+ Page *pLruPrev; /* Previous page in LRU list */
+ FileSystem *pFS; /* File system that owns this page */
+
+ /* Only used in compressed database mode: */
+ int nCompress; /* Compressed size (or 0 for uncomp. db) */
+ int nCompressPrev; /* Compressed size of prev page */
+ Segment *pSeg; /* Segment this page will be written to */
+
+ /* Pointers for singly linked lists */
+ Page *pWaitingNext; /* Next page in FileSystem.pWaiting list */
+ Page *pFreeNext; /* Next page in FileSystem.pFree list */
+ Page *pMappedNext; /* Next page in FileSystem.pMapped list */
+};
+
+/*
+** Meta-data page handle. There are two meta-data pages at the start of
+** the database file, each FileSystem.nMetasize bytes in size.
+*/
+struct MetaPage {
+ int iPg; /* Either 1 or 2 */
+ int bWrite; /* Write back to db file on release */
+ u8 *aData; /* Pointer to buffer */
+ FileSystem *pFS; /* FileSystem that owns this page */
+};
+
+/*
+** Values for Page.flags. These are bit flags and may be combined.
+*/
+#define PAGE_DIRTY 0x00000001 /* Set if page is dirty */
+#define PAGE_FREE 0x00000002 /* Set if Page.aData requires lsmFree() */
+#define PAGE_HASPREV 0x00000004 /* Set if page is first on uncomp. block */
+
+/*
+** Number of pgsz byte pages omitted from the start of block 1. The start
+** of block 1 contains two 4096 byte meta pages (8192 bytes in total).
+**
+** The result is never less than 1: for page sizes larger than 8192 bytes
+** the integer division yields 0 and LSM_MAX() raises it to a single page.
+*/
+#define BLOCK1_HDR_SIZE(pgsz) LSM_MAX(1, 8192/(pgsz))
+
+/*
+** In builds without NDEBUG, every VFS return code passed through
+** IOERR_WRAPPER() that is not LSM_OK causes lsmIoerrBkpt() to be called.
+** Set a breakpoint in lsmIoerrBkpt() to catch IO errors as they occur.
+**
+** When NDEBUG is defined IOERR_WRAPPER() is a macro that simply evaluates
+** to its argument.
+*/
+#ifdef NDEBUG
+# define IOERR_WRAPPER(rc) (rc)
+#else
+static void lsmIoerrBkpt(void){
+  static int nErr = 0;
+  nErr++;
+}
+static int IOERR_WRAPPER(int rc){
+  if( rc==LSM_OK ) return rc;
+  lsmIoerrBkpt();
+  return rc;
+}
+#endif
+
+/*
+** Debugging aid that checks the consistency of the LRU list of pFS. With
+** NDEBUG defined it compiles to nothing. Without NDEBUG it is a real
+** function, but its body is currently disabled by "#if 0", so it performs
+** no checks either way.
+*/
+#ifdef NDEBUG
+# define assert_lists_are_ok(x)
+#else
+static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash);
+
+static void assert_lists_are_ok(FileSystem *pFS){
+#if 0
+ Page *p;
+
+ assert( pFS->nMapLimit>=0 );
+
+ /* Check that all pages in the LRU list have nRef==0, pointers to buffers
+ ** in heap memory, and corresponding entries in the hash table. */
+ for(p=pFS->pLruFirst; p; p=p->pLruNext){
+ assert( p==pFS->pLruFirst || p->pLruPrev!=0 );
+ assert( p==pFS->pLruLast || p->pLruNext!=0 );
+ assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p );
+ assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p );
+ assert( p->nRef==0 );
+ assert( p->flags & PAGE_FREE );
+ assert( p==fsPageFindInHash(pFS, p->iPg, 0) );
+ }
+#endif
+}
+#endif
+
+/*
+** Wrappers around the VFS methods of the lsm_env object:
+**
+** lsmEnvOpen()
+** lsmEnvRead()
+** lsmEnvWrite()
+** lsmEnvSync()
+** lsmEnvSectorSize()
+** lsmEnvClose()
+** lsmEnvTruncate()
+** lsmEnvUnlink()
+** lsmEnvRemap()
+*/
+/* Open file zFile via the xOpen method of pEnv; handle stored in *ppNew. */
+int lsmEnvOpen(lsm_env *pEnv, const char *zFile, int flags, lsm_file **ppNew){
+  int rc = pEnv->xOpen(pEnv, zFile, flags, ppNew);
+  return rc;
+}
+
+/* Read nRead bytes at offset iOff of pFile into pRead using xRead. */
+static int lsmEnvRead(
+  lsm_env *pEnv,
+  lsm_file *pFile,
+  lsm_i64 iOff,
+  void *pRead,
+  int nRead
+){
+  int rc = pEnv->xRead(pFile, iOff, pRead, nRead);
+  return IOERR_WRAPPER( rc );
+}
+
+/* Write nWrite bytes from pWrite to offset iOff of pFile using xWrite. */
+static int lsmEnvWrite(
+  lsm_env *pEnv,
+  lsm_file *pFile,
+  lsm_i64 iOff,
+  const void *pWrite,
+  int nWrite
+){
+  /* xWrite takes a non-const pointer, hence the cast. */
+  void *pData = (void *)pWrite;
+  int rc = pEnv->xWrite(pFile, iOff, pData, nWrite);
+  return IOERR_WRAPPER( rc );
+}
+
+/* Invoke the xSync method of pEnv on pFile. */
+static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){
+  int rc = pEnv->xSync(pFile);
+  return IOERR_WRAPPER( rc );
+}
+
+/* Return the value reported by the xSectorSize method for pFile. */
+static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){
+  int szSector = pEnv->xSectorSize(pFile);
+  return szSector;
+}
+
+/* Close pFile using the xClose method of pEnv. */
+int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){
+  int rc = pEnv->xClose(pFile);
+  return IOERR_WRAPPER( rc );
+}
+
+/* Invoke the xTruncate method of pEnv on pFile with size nByte. */
+static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){
+  int rc = pEnv->xTruncate(pFile, nByte);
+  return IOERR_WRAPPER( rc );
+}
+
+/* Invoke the xUnlink method of pEnv on path zDel. */
+static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){
+  int rc = pEnv->xUnlink(pEnv, zDel);
+  return IOERR_WRAPPER( rc );
+}
+
+/*
+** Invoke the xRemap method of pEnv on pFile, passing szMin and the two
+** output pointers through unchanged. Unlike most wrappers in this file,
+** the return code does not pass through IOERR_WRAPPER().
+*/
+static int lsmEnvRemap(
+  lsm_env *pEnv,
+  lsm_file *pFile,
+  i64 szMin,
+  void **ppMap,
+  i64 *pszMap
+){
+  int rc = pEnv->xRemap(pFile, szMin, ppMap, pszMap);
+  return rc;
+}
+
+/*
+** Invoke the xLock method of pEnv on pFile. If pFile is NULL this is a
+** no-op that returns LSM_OK.
+*/
+int lsmEnvLock(lsm_env *pEnv, lsm_file *pFile, int iLock, int eLock){
+  int rc = LSM_OK;
+  if( pFile ){
+    rc = pEnv->xLock(pFile, iLock, eLock);
+  }
+  return rc;
+}
+
+/* Invoke the xTestLock method of pEnv, passing all arguments through. */
+int lsmEnvTestLock(
+  lsm_env *pEnv,
+  lsm_file *pFile,
+  int iLock,
+  int nLock,
+  int eLock
+){
+  int rc = pEnv->xTestLock(pFile, iLock, nLock, eLock);
+  return rc;
+}
+
+/* Invoke the xShmMap method of pEnv, passing all arguments through. */
+int lsmEnvShmMap(
+  lsm_env *pEnv,
+  lsm_file *pFile,
+  int iChunk,
+  int sz,
+  void **ppOut
+){
+  int rc = pEnv->xShmMap(pFile, iChunk, sz, ppOut);
+  return rc;
+}
+
+/* Invoke the xShmBarrier method of pEnv. */
+void lsmEnvShmBarrier(lsm_env *pEnv){
+  (*pEnv->xShmBarrier)();
+}
+
+/* Invoke the xShmUnmap method of pEnv on pFile with flag bDel. */
+void lsmEnvShmUnmap(lsm_env *pEnv, lsm_file *pFile, int bDel){
+  (*pEnv->xShmUnmap)(pFile, bDel);
+}
+
+/* Invoke the xSleep method of pEnv with argument nUs. */
+void lsmEnvSleep(lsm_env *pEnv, int nUs){
+  (*pEnv->xSleep)(pEnv, nUs);
+}
+
+
+/*
+** Write the contents of string buffer pStr into the log file, starting at
+** offset iOff.
+*/
+int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr){
+  lsm_file *fd = pFS->fdLog;
+
+  /* The log file must already be open. */
+  assert( fd );
+  return lsmEnvWrite(pFS->pEnv, fd, iOff, pStr->z, pStr->n);
+}
+
+/*
+** fsync() the log file.
+*/
+int lsmFsSyncLog(FileSystem *pFS){
+  lsm_file *fd = pFS->fdLog;
+
+  /* The log file must already be open. */
+  assert( fd );
+  return lsmEnvSync(pFS->pEnv, fd);
+}
+
+/*
+** Read nRead bytes of data starting at offset iOff of the log file. Append
+** the results to string buffer pStr.
+*/
+int lsmFsReadLog(FileSystem *pFS, i64 iOff, int nRead, LsmString *pStr){
+  int rc;                         /* Return code */
+
+  /* The log file must already be open. */
+  assert( pFS->fdLog );
+
+  /* Make room for nRead more bytes at the end of the buffer, then read
+  ** directly into that space. */
+  rc = lsmStringExtend(pStr, nRead);
+  if( rc==LSM_OK ){
+    rc = lsmEnvRead(pFS->pEnv, pFS->fdLog, iOff, &pStr->z[pStr->n], nRead);
+
+    /* Only account for the new bytes if the read succeeded. Otherwise the
+    ** buffer length would cover bytes that were never populated. */
+    if( rc==LSM_OK ){
+      pStr->n += nRead;
+    }
+  }
+  return rc;
+}
+
+/*
+** Truncate the log file to nByte bytes in size.
+*/
+int lsmFsTruncateLog(FileSystem *pFS, i64 nByte){
+  int rc = LSM_OK;
+
+  /* If the log file is not open, this is a no-op. */
+  if( pFS->fdLog ){
+    rc = lsmEnvTruncate(pFS->pEnv, pFS->fdLog, nByte);
+  }
+  return rc;
+}
+
+/*
+** Truncate the db file to nByte bytes in size.
+*/
+int lsmFsTruncateDb(FileSystem *pFS, i64 nByte){
+  int rc = LSM_OK;
+
+  /* If the database file is not open, this is a no-op. */
+  if( pFS->fdDb ){
+    rc = lsmEnvTruncate(pFS->pEnv, pFS->fdDb, nByte);
+  }
+  return rc;
+}
+
+/*
+** Close the log file. Then delete it from the file-system. This function
+** is called during database shutdown only.
+*/
+int lsmFsCloseAndDeleteLog(FileSystem *pFS){
+  /* Close the log file handle, if one is open. */
+  if( pFS->fdLog ){
+    lsmEnvClose(pFS->pEnv, pFS->fdLog );
+    pFS->fdLog = 0;
+  }
+
+  /* pFS->zLog already holds the log file path (the database path with
+  ** "-log" appended, set up by lsmFsOpen()). Using it directly avoids a
+  ** heap allocation that, on failure, would silently leave the log file
+  ** in place. The unlink itself is best-effort: its result is ignored and
+  ** this function always returns LSM_OK. */
+  lsmEnvUnlink(pFS->pEnv, pFS->zLog);
+  return LSM_OK;
+}
+
+/*
+** Return true if page iReal of the database should be accessed using mmap.
+** False otherwise.
+*/
+static int fsMmapPage(FileSystem *pFS, Pgno iReal){
+  /* Page N begins at byte offset (N-1)*pgsz, so N*pgsz is the offset of the
+  ** first byte past the page. The page is mapped only if it ends at or
+  ** before the mapping limit. */
+  i64 iEnd = (i64)iReal * pFS->nPagesize;
+  return (iEnd <= pFS->nMapLimit);
+}
+
+/*
+** Given that there are currently nHash slots in the hash table, return
+** the hash key (slot index) for page iPg.
+*/
+static int fsHashKey(int nHash, Pgno iPg){
+  Pgno iSlot = iPg % nHash;
+  return iSlot;
+}
+
+/*
+** This is a helper function for lsmFsOpen(). It opens a single file on
+** disk (either the database or log file).
+*/
+static lsm_file *fsOpenFile(
+  FileSystem *pFS,                /* File system object */
+  int bReadonly,                  /* True to open this file read-only */
+  int bLog,                       /* True for log, false for db */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  lsm_file *pFile = 0;            /* Handle to return */
+  const char *zPath;              /* Path of file to open */
+  int flags = 0;                  /* Flags to pass to lsmEnvOpen() */
+
+  /* No-op if an error has already occurred. */
+  if( *pRc!=LSM_OK ) return 0;
+
+  if( bReadonly ){
+    flags = LSM_OPEN_READONLY;
+  }
+  if( bLog ){
+    zPath = pFS->zLog;
+  }else{
+    zPath = pFS->zDb;
+  }
+
+  *pRc = lsmEnvOpen(pFS->pEnv, zPath, flags, &pFile);
+  return pFile;
+}
+
+/*
+** If it is not already open, this function opens the log file. It returns
+** LSM_OK if successful (or if the log file was already open) or an LSM
+** error code otherwise.
+**
+** The log file must be opened before any of the following may be called:
+**
+** lsmFsWriteLog
+** lsmFsSyncLog
+** lsmFsReadLog
+*/
+int lsmFsOpenLog(lsm_db *db, int *pbOpen){
+  FileSystem *pFS = db->pFS;
+  int rc = LSM_OK;
+
+  if( pFS->fdLog==0 ){
+    pFS->fdLog = fsOpenFile(pFS, db->bReadonly, 1, &rc);
+
+    /* For a read-only connection a missing log file is not an error. */
+    if( db->bReadonly && rc==LSM_IOERR_NOENT ){
+      rc = LSM_OK;
+    }
+  }
+
+  /* Report whether or not the log file is now open, if requested. */
+  if( pbOpen ){
+    *pbOpen = (pFS->fdLog!=0);
+  }
+  return rc;
+}
+
+/*
+** Close the log file, if it is open.
+*/
+void lsmFsCloseLog(lsm_db *db){
+  FileSystem *pFS = db->pFS;
+
+  /* Nothing to do if the log file is not open. */
+  if( pFS->fdLog==0 ) return;
+
+  lsmEnvClose(pFS->pEnv, pFS->fdLog);
+  pFS->fdLog = 0;
+}
+
+/*
+** Open a connection to a database stored within the file-system.
+**
+** If parameter bReadonly is true, then open a read-only file-descriptor
+** on the database file. It is possible that bReadonly will be false even
+** if the user requested that pDb be opened read-only. This is because the
+** file-descriptor may later on be recycled by a read-write connection.
+** If the db file can be opened for read-write access, it always is. Parameter
+** bReadonly is only ever true if it has already been determined that the
+** db can only be opened for read-only access.
+**
+** Return LSM_OK if successful or an lsm error code otherwise.
+*/
+int lsmFsOpen(
+ lsm_db *pDb, /* Database connection to open fd for */
+ const char *zDb, /* Full path to database file */
+ int bReadonly /* True to open db file read-only */
+){
+ FileSystem *pFS;
+ int rc = LSM_OK;
+ int nDb = strlen(zDb);
+ int nByte;
+
+ assert( pDb->pFS==0 );
+ assert( pDb->pWorker==0 && pDb->pClient==0 );
+
+ /* A single allocation holds the FileSystem object followed by the
+ ** nul-terminated database path (nDb+1 bytes) and the nul-terminated log
+ ** file path (nDb+4+1 bytes - the database path plus "-log"). */
+ nByte = sizeof(FileSystem) + nDb+1 + nDb+4+1;
+ pFS = (FileSystem *)lsmMallocZeroRc(pDb->pEnv, nByte, &rc);
+ if( pFS ){
+ LsmFile *pLsmFile;
+ pFS->zDb = (char *)&pFS[1];
+ pFS->zLog = &pFS->zDb[nDb+1];
+ pFS->nPagesize = LSM_DFLT_PAGE_SIZE;
+ pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE;
+ pFS->nMetasize = LSM_META_PAGE_SIZE;
+ pFS->nMetaRwSize = LSM_META_RW_PAGE_SIZE;
+ pFS->pDb = pDb;
+ pFS->pEnv = pDb->pEnv;
+
+ /* Make a copy of the database and log file names. */
+ memcpy(pFS->zDb, zDb, nDb+1);
+ memcpy(pFS->zLog, zDb, nDb);
+ memcpy(&pFS->zLog[nDb], "-log", 5);
+
+ /* Allocate the hash-table here. At some point, it should be changed
+ ** so that it can grow dynamically. */
+ pFS->nCacheMax = 2048*1024 / pFS->nPagesize;
+ pFS->nHash = 4096;
+ pFS->apHash = lsmMallocZeroRc(pDb->pEnv, sizeof(Page *) * pFS->nHash, &rc);
+
+ /* Open the database file */
+ pLsmFile = lsmDbRecycleFd(pDb);
+ if( pLsmFile ){
+ /* A recycled descriptor is available. Take over its file handle and
+ ** zero the LsmFile so that it can serve as this object's pLsmFile. */
+ pFS->pLsmFile = pLsmFile;
+ pFS->fdDb = pLsmFile->pFile;
+ memset(pLsmFile, 0, sizeof(LsmFile));
+ }else{
+ pFS->pLsmFile = lsmMallocZeroRc(pDb->pEnv, sizeof(LsmFile), &rc);
+ if( rc==LSM_OK ){
+ pFS->fdDb = fsOpenFile(pFS, bReadonly, 0, &rc);
+ }
+ }
+
+ /* On any error, release everything acquired so far. Otherwise cache
+ ** the sector size reported for the database file. */
+ if( rc!=LSM_OK ){
+ lsmFsClose(pFS);
+ pFS = 0;
+ }else{
+ pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb);
+ }
+ }
+
+ pDb->pFS = pFS;
+ return rc;
+}
+
+/*
+** Configure the file-system object according to the current values of
+** the LSM_CONFIG_MMAP and LSM_CONFIG_SET_COMPRESSION options.
+*/
+int lsmFsConfigure(lsm_db *db){
+  FileSystem *pFS = db->pFS;
+  if( pFS ){
+    lsm_env *pEnv = pFS->pEnv;
+    Page *pPg;
+
+    /* This may only be called while no pages are outstanding, waiting to
+    ** be written, or holding pointers into the mapping. */
+    assert( pFS->nOut==0 );
+    assert( pFS->pWaiting==0 );
+    assert( pFS->pMapped==0 );
+
+    /* Reset any compression/decompression buffers already allocated. The
+    ** pointers must be cleared as well as freed: lsmFsClose() and any
+    ** later call to this function free them again, which would otherwise
+    ** be a double-free. */
+    lsmFree(pEnv, pFS->aIBuffer);
+    lsmFree(pEnv, pFS->aOBuffer);
+    pFS->aIBuffer = 0;
+    pFS->aOBuffer = 0;
+    pFS->nBuffer = 0;
+
+    /* Unmap the file, if it is currently mapped */
+    if( pFS->pMap ){
+      lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
+      pFS->nMapLimit = 0;
+    }
+
+    /* Free all allocated page structures. Pages on the LRU list own their
+    ** data buffers; Page structures on the pFree list do not. */
+    pPg = pFS->pLruFirst;
+    while( pPg ){
+      Page *pNext = pPg->pLruNext;
+      assert( pPg->flags & PAGE_FREE );
+      lsmFree(pEnv, pPg->aData);
+      lsmFree(pEnv, pPg);
+      pPg = pNext;
+    }
+
+    pPg = pFS->pFree;
+    while( pPg ){
+      Page *pNext = pPg->pFreeNext;
+      lsmFree(pEnv, pPg);
+      pPg = pNext;
+    }
+
+    /* Zero pointers that point to deleted page objects */
+    pFS->nCacheAlloc = 0;
+    pFS->pLruFirst = 0;
+    pFS->pLruLast = 0;
+    pFS->pFree = 0;
+    if( pFS->apHash ){
+      memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0]));
+    }
+
+    /* Configure the FileSystem object. A compressed database never uses
+    ** mmap, so its map limit is zero. */
+    if( db->compress.xCompress ){
+      pFS->pCompress = &db->compress;
+      pFS->nMapLimit = 0;
+    }else{
+      pFS->pCompress = 0;
+      if( db->iMmap==1 ){
+        /* Unlimited */
+        pFS->nMapLimit = (i64)1 << 60;
+      }else{
+        /* iMmap is a limit in KB. Set nMapLimit to the same value in bytes. */
+        pFS->nMapLimit = (i64)db->iMmap * 1024;
+      }
+    }
+  }
+
+  return LSM_OK;
+}
+
+/*
+** Close and destroy a FileSystem object.
+*/
+void lsmFsClose(FileSystem *pFS){
+  lsm_env *pEnv;
+  Page *pPg;
+  Page *pNext;
+
+  /* Passing NULL is a harmless no-op. */
+  if( pFS==0 ) return;
+  pEnv = pFS->pEnv;
+
+  assert( pFS->nOut==0 );
+
+  /* Free every page on the LRU list, along with its data buffer if the
+  ** page owns one. */
+  for(pPg=pFS->pLruFirst; pPg; pPg=pNext){
+    pNext = pPg->pLruNext;
+    if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
+    lsmFree(pEnv, pPg);
+  }
+
+  /* Likewise for every Page structure on the free list. */
+  for(pPg=pFS->pFree; pPg; pPg=pNext){
+    pNext = pPg->pFreeNext;
+    if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);
+    lsmFree(pEnv, pPg);
+  }
+
+  /* Close whichever file handles are still open, then release the
+  ** remaining allocations and the object itself. */
+  if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb );
+  if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog );
+  lsmFree(pEnv, pFS->pLsmFile);
+  lsmFree(pEnv, pFS->apHash);
+  lsmFree(pEnv, pFS->aIBuffer);
+  lsmFree(pEnv, pFS->aOBuffer);
+  lsmFree(pEnv, pFS);
+}
+
+/*
+** This function is called when closing a database handle (i.e. lsm_close())
+** if there exist other connections to the same database within this process.
+** In that case the file-descriptor open on the database file is not closed
+** when the FileSystem object is destroyed, as this would cause any POSIX
+** locks held by the other connections to be silently dropped (see "man close"
+** for details). Instead, the file-descriptor is stored in a list by the
+** lsm_shared.c module until it is either closed or reused.
+**
+** This function returns a pointer to an object that can be linked into
+** the list described above. The returned object now 'owns' the database
+** file descriptor, so that when the FileSystem object is destroyed, it
+** will not be closed.
+**
+** This function may be called at most once in the life-time of a
+** FileSystem object. The results of any operations involving the database
+** file descriptor are undefined once this function has been called.
+**
+** None of this is necessary on non-POSIX systems. But we do it anyway in
+** the name of using as similar code as possible on all platforms.
+*/
+LsmFile *lsmFsDeferClose(FileSystem *pFS){
+  LsmFile *pRet = pFS->pLsmFile;
+  assert( pRet->pNext==0 );
+
+  /* Transfer the database file handle to the returned object and detach
+  ** both it and the LsmFile from pFS, so that lsmFsClose() neither closes
+  ** the handle nor frees the LsmFile. */
+  pRet->pFile = pFS->fdDb;
+  pFS->pLsmFile = 0;
+  pFS->fdDb = 0;
+  return pRet;
+}
+
+/*
+** Allocate a buffer and populate it with the output of the xFileid()
+** method of the database file handle. If successful, set *ppId to point
+** to the buffer and *pnId to the number of bytes in the buffer and return
+** LSM_OK. Otherwise, set *ppId and *pnId to zero and return an LSM
+** error code.
+*/
+int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId){
+  lsm_env *pEnv = pDb->pEnv;
+  lsm_file *fd = pDb->pFS->fdDb;
+  void *pBuf;                     /* Buffer to return via *ppId */
+  int nBuf = 0;                   /* Size of pBuf in bytes */
+  int rc;
+
+  /* Call xFileid() with a NULL buffer to obtain the required size, allocate
+  ** a buffer of that size, then call it again to populate the buffer. */
+  rc = pEnv->xFileid(fd, 0, &nBuf);
+  pBuf = lsmMallocZeroRc(pEnv, nBuf, &rc);
+  if( rc==LSM_OK ){
+    rc = pEnv->xFileid(fd, pBuf, &nBuf);
+  }
+
+  /* On error, discard the buffer and report an empty id. */
+  if( rc!=LSM_OK ){
+    lsmFree(pEnv, pBuf);
+    pBuf = 0;
+    nBuf = 0;
+  }
+
+  *ppId = pBuf;
+  *pnId = nBuf;
+  return rc;
+}
+
+/*
+** Return the nominal page-size used by this file-system. Actual pages
+** may be smaller or larger than this value.
+*/
+int lsmFsPageSize(FileSystem *pFS){
+  int nPgsz = pFS->nPagesize;
+  return nPgsz;
+}
+
+/*
+** Return the block-size used by this file-system.
+*/
+int lsmFsBlockSize(FileSystem *pFS){
+  int nBlksz = pFS->nBlocksize;
+  return nBlksz;
+}
+
+/*
+** Configure the nominal page-size used by this file-system. Actual
+** pages may be smaller or larger than this value.
+*/
+void lsmFsSetPageSize(FileSystem *pFS, int nPgsz){
+  /* The cache limit is expressed in pages, sized so that the cache holds
+  ** 2048*1024 bytes of page data. Recompute it for the new page size. */
+  int nMax = 2048*1024 / nPgsz;
+  pFS->nPagesize = nPgsz;
+  pFS->nCacheMax = nMax;
+}
+
+/*
+** Configure the block-size used by this file-system.
+*/
+void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){
+  /* Only the stored value changes; no dependent state is recomputed. */
+  (*pFS).nBlocksize = nBlocksize;
+}
+
+/*
+** Return the page number of the first page on block iBlock. Blocks are
+** numbered starting from 1.
+**
+** For a compressed database, page numbers are byte offsets. The first
+** page on each block is the byte offset immediately following the 4-byte
+** "previous block" pointer at the start of each block.
+*/
+static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){
+  Pgno iPg;
+  if( pFS->pCompress ){
+    /* Compressed: page numbers are byte offsets. Skip the two meta pages
+    ** (block 1) or the preceding blocks, plus the 4-byte block pointer. */
+    if( iBlock==1 ){
+      iPg = pFS->nMetasize * 2 + 4;
+    }else{
+      iPg = pFS->nBlocksize * (Pgno)(iBlock-1) + 4;
+    }
+  }else{
+    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+    if( iBlock==1 ){
+      /* The first usable page is the one following the meta pages. */
+      iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize);
+    }else{
+      /* Widen to Pgno before multiplying, as the compressed branch does,
+      ** so that the product is not computed in (possibly overflowing) int
+      ** arithmetic for large block numbers. */
+      iPg = 1 + (Pgno)(iBlock-1) * nPagePerBlock;
+    }
+  }
+  return iPg;
+}
+
+/*
+** Return the page number of the last page on block iBlock. Blocks are
+** numbered starting from 1.
+**
+** For a compressed database, page numbers are byte offsets. The last
+** page on each block is the byte offset of the byte immediately before
+** the 4-byte "next block" pointer at the end of each block.
+*/
+static Pgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){
+  if( pFS->pCompress ){
+    return pFS->nBlocksize * (Pgno)iBlock - 1 - 4;
+  }else{
+    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+    /* Widen to Pgno before multiplying, as the compressed branch does, so
+    ** that the product is not computed in (possibly overflowing) int
+    ** arithmetic for large block numbers. */
+    return (Pgno)iBlock * nPagePerBlock;
+  }
+}
+
+/*
+** Return the block number of the block that page iPg is located on.
+** Blocks are numbered starting from 1.
+*/
+static int fsPageToBlock(FileSystem *pFS, Pgno iPg){
+  Pgno iBlk;
+  if( pFS->pCompress ){
+    /* Page numbers are byte offsets in compressed databases. */
+    iBlk = (iPg / pFS->nBlocksize) + 1;
+  }else{
+    iBlk = 1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize));
+  }
+  return (int)iBlk;
+}
+
+/*
+** Return true if page iPg is the last page on its block.
+**
+** This function is only called in non-compressed database mode.
+*/
+static int fsIsLast(FileSystem *pFS, Pgno iPg){
+  const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+  assert( !pFS->pCompress );
+
+  /* Page number zero is never the last page of a block. */
+  if( iPg==0 ) return 0;
+  return ( (iPg % nPagePerBlock)==0 );
+}
+
+/*
+** Return true if page iPg is the first page on its block.
+**
+** This function is only called in non-compressed database mode.
+*/
+static int fsIsFirst(FileSystem *pFS, Pgno iPg){
+  const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+  assert( !pFS->pCompress );
+
+  /* NOTE(review): the text between "(iPg" and "nData;" was lost when this
+  ** file was extracted (everything from a '<' to the following '>' was
+  ** stripped). The second half of this condition and the function below
+  ** are a reconstruction - confirm against the pristine source.
+  **
+  ** A page is first on its block if it is the first of a run of
+  ** nPagePerBlock pages or, within block 1, if it is the first usable
+  ** page following the meta pages. */
+  return ( (iPg % nPagePerBlock)==1
+        || (iPg<nPagePerBlock && iPg==fsFirstPageOnBlock(pFS, 1))
+  );
+}
+
+/*
+** Given a page reference, return a pointer to the buffer containing the
+** page's contents. If parameter pnData is not NULL, set *pnData to the size
+** of the buffer in bytes before returning.
+*/
+u8 *lsmFsPageData(Page *pPage, int *pnData){
+  if( pnData ){
+    *pnData = pPage->nData;
+  }
+  return pPage->aData;
+}
+
+/*
+** Return the page number of a page.
+*/
+Pgno lsmFsPageNumber(Page *pPage){
+  /* assert( (pPage->flags & PAGE_DIRTY)==0 ); */
+
+  /* A NULL page handle is reported as page number 0. */
+  if( pPage==0 ) return 0;
+  return pPage->iPg;
+}
+
+/*
+** Page pPg is currently part of the LRU list belonging to pFS. Remove
+** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this
+** operation.
+*/
+static void fsPageRemoveFromLru(FileSystem *pFS, Page *pPg){
+  Page *pPrev = pPg->pLruPrev;
+  Page *pNext = pPg->pLruNext;
+
+  /* A page with no neighbour on one side must be the list head or tail. */
+  assert( pNext || pPg==pFS->pLruLast );
+  assert( pPrev || pPg==pFS->pLruFirst );
+
+  /* Unlink from the successor side, updating the tail if required. */
+  if( pNext==0 ){
+    pFS->pLruLast = pPrev;
+  }else{
+    pNext->pLruPrev = pPrev;
+  }
+
+  /* Unlink from the predecessor side, updating the head if required. */
+  if( pPrev==0 ){
+    pFS->pLruFirst = pNext;
+  }else{
+    pPrev->pLruNext = pNext;
+  }
+
+  pPg->pLruPrev = 0;
+  pPg->pLruNext = 0;
+}
+
+/*
+** Page pPg is not currently part of the LRU list belonging to pFS. Add it.
+*/
+static void fsPageAddToLru(FileSystem *pFS, Page *pPg){
+  Page *pTail = pFS->pLruLast;
+  assert( pPg->pLruNext==0 && pPg->pLruPrev==0 );
+
+  /* Append pPg at the tail of the list. If the list was empty, pPg also
+  ** becomes the head. */
+  pPg->pLruPrev = pTail;
+  if( pTail==0 ){
+    pFS->pLruFirst = pPg;
+  }else{
+    pTail->pLruNext = pPg;
+  }
+  pFS->pLruLast = pPg;
+}
+
+/*
+** Page pPg is currently stored in the apHash/nHash hash table. Remove it.
+*/
+static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
+  /* Start at the head of the overflow chain for pPg's hash slot. */
+  Page **ppLink = &pFS->apHash[ fsHashKey(pFS->nHash, pPg->iPg) ];
+
+  /* Walk the chain until the link that points at pPg is found. The page
+  ** must be present in the chain - there is no check for end-of-list. */
+  while( *ppLink!=pPg ){
+    ppLink = &(*ppLink)->pHashNext;
+  }
+
+  *ppLink = pPg->pHashNext;
+  pPg->pHashNext = 0;
+}
+
+/*
+** Free a Page object allocated by fsPageBuffer().
+*/
+static void fsPageBufferFree(Page *pPg){
+  FileSystem *pFS = pPg->pFS;
+  lsm_env *pEnv = pFS->pEnv;
+
+  /* One fewer non-mmap page is now allocated. */
+  pFS->nCacheAlloc--;
+  lsmFree(pEnv, pPg->aData);
+  lsmFree(pEnv, pPg);
+}
+
+
+/*
+** Purge the cache of all non-mmap pages with nRef==0.
+*/
+void lsmFsPurgeCache(FileSystem *pFS){
+  Page *pPg;
+  Page *pNext;
+
+  /* Every page on the LRU list is unreferenced. Remove each from the hash
+  ** table and free it. The next pointer is saved first as the page object
+  ** is gone once fsPageBufferFree() returns. */
+  for(pPg=pFS->pLruFirst; pPg; pPg=pNext){
+    pNext = pPg->pLruNext;
+    assert( pPg->flags & PAGE_FREE );
+    fsPageRemoveFromHash(pFS, pPg);
+    fsPageBufferFree(pPg);
+  }
+
+  /* The LRU list is now empty. */
+  pFS->pLruFirst = 0;
+  pFS->pLruLast = 0;
+
+  assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );
+}
+
+/*
+** Search the hash-table for page iPg. If an entry is found, return a
+** pointer to it. Otherwise, return NULL.
+**
+** Either way, if argument piHash is not NULL, set *piHash to the hash slot
+** number that page iPg would be stored in before returning.
+*/
+static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash){
+  int iSlot = fsHashKey(pFS->nHash, iPg);
+  Page *pHit;                     /* Return value */
+
+  if( piHash!=0 ){
+    *piHash = iSlot;
+  }
+
+  /* Follow the collision chain for this slot until iPg is located. */
+  pHit = pFS->apHash[iSlot];
+  while( pHit && pHit->iPg!=iPg ){
+    pHit = pHit->pHashNext;
+  }
+  return pHit;
+}
+
+/*
+** Allocate and return a non-mmap Page object. If there are already
+** nCacheMax such Page objects outstanding, try to recycle an existing
+** Page instead.
+**
+** A new Page (and its nPagesize byte data buffer) is allocated when the
+** LRU list is empty or when fewer than nCacheMax buffers are currently
+** allocated. Otherwise the page at the head of the LRU list is unlinked
+** from the LRU list and the hash table, zeroed (keeping its data buffer)
+** and reused.
+**
+** On success *ppOut points to a Page with flags==PAGE_FREE and LSM_OK is
+** returned. If memory allocation fails, *ppOut is set to NULL and
+** LSM_NOMEM_BKPT is returned.
+*/
+static int fsPageBuffer(
+  FileSystem *pFS,
+  Page **ppOut
+){
+  int rc = LSM_OK;
+  Page *pPage = 0;
+  if( pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
+    /* Allocate a new Page object */
+    pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
+    if( !pPage ){
+      rc = LSM_NOMEM_BKPT;
+    }else{
+      pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);
+      if( !pPage->aData ){
+        /* Do not leak the Page header if the data buffer cannot be had. */
+        lsmFree(pFS->pEnv, pPage);
+        rc = LSM_NOMEM_BKPT;
+        pPage = 0;
+      }else{
+        pFS->nCacheAlloc++;
+      }
+    }
+  }else{
+    /* Reuse an existing Page object */
+    u8 *aData;
+    pPage = pFS->pLruFirst;
+    aData = pPage->aData;
+    fsPageRemoveFromLru(pFS, pPage);
+    fsPageRemoveFromHash(pFS, pPage);
+
+    /* Wipe all state except the data buffer, which is carried over. */
+    memset(pPage, 0, sizeof(Page));
+    pPage->aData = aData;
+  }
+
+  if( pPage ){
+    pPage->flags = PAGE_FREE;
+  }
+  *ppOut = pPage;
+  return rc;
+}
+
+/*
+** Ensure that the memory-mapped region of the database file is at least
+** iSz bytes in size. If it is currently smaller, it is remapped and, if
+** the mapping moved, the aData pointer of every Page on the pFS->pMapped
+** list is adjusted by the same amount and lsmSortedRemap() is invoked.
+**
+** If *pRc is not LSM_OK when this function is called, it is a no-op.
+** Otherwise, if a remap is attempted, *pRc is set to its result code.
+**
+** This function is never called in compressed database mode.
+*/
+static void fsGrowMapping(
+  FileSystem *pFS,                /* File system object */
+  i64 iSz,                        /* Minimum size to extend mapping to */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  u8 *aPrev;                      /* Mapping address before the remap */
+  int rc;
+
+  assert( pFS->pCompress==0 );
+  assert( PAGE_HASPREV==4 );
+
+  /* Nothing to do after an earlier error or if the mapping is big enough. */
+  if( *pRc!=LSM_OK || iSz<=pFS->nMap ) return;
+
+  aPrev = pFS->pMap;
+  rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
+  if( rc==LSM_OK && pFS->pMap!=aPrev ){
+    i64 nShift = (u8 *)pFS->pMap - aPrev;
+    Page *pPg;
+    for(pPg=pFS->pMapped; pPg; pPg=pPg->pMappedNext){
+      pPg->aData += nShift;
+    }
+    lsmSortedRemap(pFS->pDb);
+  }
+  *pRc = rc;
+}
+
+/*
+** If it is mapped, unmap the database file. A NULL pFS is a no-op that
+** returns LSM_OK; otherwise the result of lsmEnvRemap() with a requested
+** size of -1 is returned.
+*/
+int lsmFsUnmap(FileSystem *pFS){
+  if( pFS==0 ) return LSM_OK;
+  return lsmEnvRemap(pFS->pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
+}
+
+/*
+** fsync() the database file and return the result code from
+** lsmEnvSync(). Parameter nBlock is not used by this implementation.
+*/
+int lsmFsSyncDb(FileSystem *pFS, int nBlock){
+  int rc = lsmEnvSync(pFS->pEnv, pFS->fdDb);
+  return rc;
+}
+
+/*
+** If block iBlk has been redirected according to the redirections in the
+** object passed as the first argument, return the destination block to
+** which it is redirected. Otherwise, return a copy of iBlk.
+**
+** Argument p may be NULL, in which case no redirection applies. The
+** redirect array p->a[] holds p->n (iFrom, iTo) pairs; the first entry
+** whose iFrom matches iBlk wins.
+*/
+static int fsRedirectBlock(Redirect *p, int iBlk){
+  if( p ){
+    int i;
+    for(i=0; i<p->n; i++){
+      if( iBlk==p->a[i].iFrom ) return p->a[i].iTo;
+    }
+  }
+  /* Only an un-redirected block number is required to be non-zero. */
+  assert( iBlk!=0 );
+  return iBlk;
+}
+
+/*
+** If page iPg has been redirected according to the redirections in the
+** object passed as the second argument, return the destination page to
+** which it is redirected. Otherwise, return a copy of iPg.
+**
+** pRedir may be NULL, in which case iPg is returned unchanged. The scan
+** of pRedir->a[] stops at the first entry whose iFrom exceeds the block
+** containing iPg, so it relies on the entries being ordered by iFrom.
+*/
+Pgno lsmFsRedirectPage(FileSystem *pFS, Redirect *pRedir, Pgno iPg){
+  Pgno iReal = iPg;
+
+  if( pRedir ){
+    /* In compressed mode page numbers are byte offsets, so a "page per
+    ** block" count is the block size in bytes. */
+    const int nPagePerBlock = (
+        pFS->pCompress ? pFS->nBlocksize : (pFS->nBlocksize / pFS->nPagesize)
+    );
+    int iBlk = fsPageToBlock(pFS, iPg);
+    int i;
+    for(i=0; i<pRedir->n; i++){
+      int iFrom = pRedir->a[i].iFrom;
+      if( iFrom>iBlk ) break;
+      if( iFrom==iBlk ){
+        int iTo = pRedir->a[i].iTo;
+        iReal = iPg - (Pgno)(iFrom - iTo) * nPagePerBlock;
+        if( iTo==1 ){
+          /* Block 1 does not start at page 1; shift by its first page. */
+          iReal += (fsFirstPageOnBlock(pFS, 1)-1);
+        }
+        break;
+      }
+    }
+  }
+
+  assert( iReal!=0 );
+  return iReal;
+}
+
+/* Required by the circular fsBlockNext<->fsPageGet dependency. */
+static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *);
+
+/*
+** Parameter iBlock is a database file block. This function reads the value
+** stored in the blocks "next block" pointer and stores it in *piNext.
+** LSM_OK is returned if everything is successful, or an LSM error code
+** otherwise.
+**
+** If pSeg is not NULL, its redirect table is applied both to iBlock before
+** reading and to the value read before it is stored in *piNext.
+**
+** If an error occurs, *piNext is left unmodified. In particular it is
+** never read, so callers may pass a pointer to an uninitialized variable.
+*/
+static int fsBlockNext(
+  FileSystem *pFS,                /* File-system object handle */
+  Segment *pSeg,                  /* Use this segment for block redirects */
+  int iBlock,                     /* Read field from this block */
+  int *piNext                     /* OUT: Next block in linked list */
+){
+  int rc;
+  int iRead;                      /* Read block from here */
+
+  if( pSeg ){
+    iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
+  }else{
+    iRead = iBlock;
+  }
+
+  assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
+  if( pFS->pCompress ){
+    i64 iOff;                     /* File offset to read data from */
+    u8 aNext[4];                  /* 4-byte pointer read from db file */
+
+    /* The pointer occupies the final 4 bytes of the block. */
+    iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
+    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
+    if( rc==LSM_OK ){
+      *piNext = (int)lsmGetU32(aNext);
+    }
+  }else{
+    /* The pointer occupies the final 4 bytes of the block's last page. */
+    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+    Page *pLast;
+    rc = fsPageGet(pFS, 0, iRead*nPagePerBlock, 0, &pLast, 0);
+    if( rc==LSM_OK ){
+      *piNext = lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
+      lsmFsPageRelease(pLast);
+    }
+  }
+
+  /* Only redirect a value that was actually read. After a failed read
+  ** *piNext has not been written and may be indeterminate. */
+  if( rc==LSM_OK && pSeg ){
+    *piNext = fsRedirectBlock(pSeg->pRedirect, *piNext);
+  }
+  return rc;
+}
+
+/*
+** Return the page number of the last page on the block that contains
+** page iPg.
+*/
+Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){
+  int iBlk = fsPageToBlock(pFS, iPg);
+  return fsLastPageOnBlock(pFS, iBlk);
+}
+
+/*
+** Read nData bytes from offset iOff of the database file into buffer
+** aData. If the requested range runs past the end of the block that
+** contains iOff, the remainder is read from the start of the block named
+** by the current block's "next block" pointer.
+**
+** Offset iOff is an absolute offset - not subject to any block redirection.
+** However any block pointer followed is. Use pSeg->pRedirect in this case.
+**
+** This function is only called in compressed database mode.
+*/
+static int fsReadData(
+  FileSystem *pFS,                /* File-system handle */
+  Segment *pSeg,                  /* Block redirection */
+  i64 iOff,                       /* Read data from this offset */
+  u8 *aData,                      /* Buffer to read data into */
+  int nData                       /* Number of bytes to read */
+){
+  i64 iBlkEnd;                    /* One past the last offset of the block */
+  i64 iCont;                      /* Offset at which the read continues */
+  int nFirst;                     /* Bytes read from the first block */
+  int iNextBlk;                   /* Block holding the rest of the data */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  iBlkEnd = fsLastPageOnPagesBlock(pFS, iOff) + 1;
+  nFirst = (int)LSM_MIN(iBlkEnd - iOff, nData);
+
+  rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nFirst);
+  if( rc!=LSM_OK || nFirst==nData ) return rc;
+
+  /* The request spills over the end of the block. Follow the pointer. */
+  rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iNextBlk);
+  if( rc!=LSM_OK ) return rc;
+
+  iCont = fsFirstPageOnBlock(pFS, iNextBlk);
+  return lsmEnvRead(pFS->pEnv, pFS->fdDb, iCont, &aData[nFirst], nData-nFirst);
+}
+
+/*
+** Parameter iBlock is a database file block. Read the "previous block"
+** pointer stored in the 4 bytes immediately before the first page of the
+** block, apply the redirects of pSeg (if pSeg is not NULL) and store the
+** result in *piPrev. LSM_OK is returned if successful, or an LSM error
+** code otherwise, in which case *piPrev is not written.
+**
+** Only the compressed-database case is implemented; the other branch is
+** an assert(0).
+*/
+static int fsBlockPrev(
+  FileSystem *pFS,                /* File-system object handle */
+  Segment *pSeg,                  /* Use this segment for block redirects */
+  int iBlock,                     /* Read field from this block */
+  int *piPrev                     /* OUT: Previous block in linked list */
+){
+  int rc = LSM_OK;                /* Return code */
+
+  assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
+  assert( iBlock>0 );
+
+  if( pFS->pCompress==0 ){
+    assert( 0 );
+  }else{
+    u8 aPrev[4];                  /* 4-byte pointer read from db file */
+    i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
+
+    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
+    if( rc==LSM_OK ){
+      int iRaw = (int)lsmGetU32(aPrev);
+      *piPrev = fsRedirectBlock((pSeg ? pSeg->pRedirect : 0), iRaw);
+    }
+  }
+  return rc;
+}
+
+/*
+** Encode routine for record size fields. The size nByte is written to
+** aBuf[0..2] as three 7-bit groups, most significant first. The top bit
+** of aBuf[0] and aBuf[2] is always set; the top bit of aBuf[1] is clear
+** when bFree is true and set otherwise.
+*/
+static void putRecordSize(u8 *aBuf, int nByte, int bFree){
+  u8 flagBit = (bFree ? 0x00 : 0x80);
+
+  aBuf[0] = (u8)(nByte >> 14) | 0x80;
+  aBuf[1] = ((u8)(nByte >> 7) & 0x7F) | flagBit;
+  aBuf[2] = (u8)nByte | 0x80;
+}
+/*
+** Decode routine for record size fields. Returns the size assembled from
+** the low 7 bits of aBuf[0], aBuf[1] and aBuf[2] (most significant group
+** first). *pbFree is set to true if the top bit of aBuf[1] is clear.
+*/
+static int getRecordSize(u8 *aBuf, int *pbFree){
+  int nHi = (aBuf[0] & 0x7F) << 14;
+  int nMid = (aBuf[1] & 0x7F) << 7;
+  int nLo = (aBuf[2] & 0x7F);
+
+  *pbFree = ((aBuf[1] & 0x80)==0);
+  return nHi + nMid + nLo;
+}
+
+/*
+** Subtract iSub from database file offset iOff and set *piRes to the
+** result. If doing so means passing the start of a block, follow the
+** "previous block" pointer (see fsBlockPrev()) and continue counting
+** backwards from the end of that block.
+**
+** Offset iOff is an absolute offset - not subject to any block redirection.
+** However any block pointer followed is. Use pSeg->pRedirect in this case.
+**
+** Return LSM_OK if successful or an lsm error code if an error occurs.
+** *piRes is written in both cases, but is only meaningful on success.
+*/
+static int fsSubtractOffset(
+  FileSystem *pFS,
+  Segment *pSeg,
+  i64 iOff,
+  int iSub,
+  i64 *piRes
+){
+  i64 iBlkStart;                  /* First offset of the block holding iOff */
+  i64 iTarget;                    /* iOff-iSub, ignoring block boundaries */
+  int iCur;                       /* Block that contains iOff */
+  int iPrevBlk = 0;               /* Block preceding iCur */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  iCur = fsPageToBlock(pFS, iOff);
+  iBlkStart = fsFirstPageOnBlock(pFS, iCur);
+  iTarget = iOff - iSub;
+  if( iTarget>=iBlkStart ){
+    /* The result lies within the same block. */
+    *piRes = iTarget;
+    return LSM_OK;
+  }
+
+  rc = fsBlockPrev(pFS, pSeg, iCur, &iPrevBlk);
+  *piRes = fsLastPageOnBlock(pFS, iPrevBlk) - iSub + (iOff - iBlkStart + 1);
+  return rc;
+}
+
+/*
+** Add iAdd to database file offset iOff and set *piRes to the
+** result. If doing so means passing the end of a block, follow the
+** block pointer stored in the last 4 bytes of the block.
+**
+** Offset iOff is an absolute offset - not subject to any block redirection.
+** However any block pointer followed is. Use pSeg->pRedirect in this case.
+**
+** Return LSM_OK if successful or an lsm error code if an error occurs.
+** *piRes is written even when an error code is returned, but its value
+** is then meaningless.
+*/
+static int fsAddOffset(
+ FileSystem *pFS,
+ Segment *pSeg,
+ i64 iOff,
+ int iAdd,
+ i64 *piRes
+){
+ i64 iEob;
+ int iBlk;
+ int rc;
+
+ assert( pFS->pCompress );
+
+ /* iEob is the last offset that still belongs to the block holding iOff. */
+ iEob = fsLastPageOnPagesBlock(pFS, iOff);
+ if( (iOff+iAdd)<=iEob ){
+ *piRes = (iOff+iAdd);
+ return LSM_OK;
+ }
+
+ /* NOTE(review): iBlk has no initializer. If fsBlockNext() fails without
+ ** storing a value, the expression below reads an indeterminate value.
+ ** fsSubtractOffset() initializes its equivalent variable to 0 - confirm
+ ** whether the same should be done here. */
+ rc = fsBlockNext(pFS, pSeg, fsPageToBlock(pFS, iOff), &iBlk);
+ /* Skip the (iEob - iOff + 1) bytes consumed in the current block and
+ ** apply the rest of iAdd from the start of the next block. */
+ *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1);
+ return rc;
+}
+
+/*
+** If it is not already allocated, allocate either the FileSystem.aOBuffer
+** (if bWrite is true) or the FileSystem.aIBuffer (if bWrite is false).
+** Return LSM_OK if successful, or LSM_NOMEM_BKPT if the attempt to
+** allocate memory fails.
+*/
+static int fsAllocateBuffer(FileSystem *pFS, int bWrite){
+  u8 **ppBuf;                     /* Pointer to either aIBuffer or aOBuffer */
+
+  assert( pFS->pCompress );
+
+  /* The first time through, work out the buffer size and cache it in
+  ** FileSystem.nBuffer: the compressor's bound for one page, but never
+  ** less than szSector+6. */
+  if( pFS->nBuffer==0 ){
+    assert( pFS->aIBuffer==0 && pFS->aOBuffer==0 );
+    pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize);
+    if( pFS->nBuffer<(pFS->szSector+6) ){
+      pFS->nBuffer = pFS->szSector+6;
+    }
+  }
+
+  if( bWrite ){
+    ppBuf = &pFS->aOBuffer;
+  }else{
+    ppBuf = &pFS->aIBuffer;
+  }
+  if( *ppBuf ) return LSM_OK;
+
+  *ppBuf = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize));
+  if( *ppBuf==0 ) return LSM_NOMEM_BKPT;
+  return LSM_OK;
+}
+
+/*
+** This function is only called in compressed database mode. It reads and
+** uncompresses the compressed data for page pPg from the database and
+** populates the pPg->aData[] buffer and pPg->nCompress field.
+**
+** It is possible that instead of a page record, there is free space
+** at offset pPg->iPg. In this case no page data is read from the file, but
+** output variable *pnSpace is set to the total number of free bytes. If
+** pnSpace is NULL in this case, LSM_CORRUPT_BKPT is returned instead.
+**
+** LSM_OK is returned if successful, or an LSM error code otherwise.
+*/
+static int fsReadPagedata(
+ FileSystem *pFS, /* File-system handle */
+ Segment *pSeg, /* pPg is part of this segment */
+ Page *pPg, /* Page to read and uncompress data for */
+ int *pnSpace /* OUT: Total bytes of free space */
+){
+ lsm_compress *p = pFS->pCompress;
+ i64 iOff = pPg->iPg;
+ u8 aSz[3];
+ int rc;
+
+ assert( p && pPg->nCompress==0 );
+
+ /* Make sure the input buffer (aIBuffer) used below exists. */
+ if( fsAllocateBuffer(pFS, 0) ) return LSM_NOMEM;
+
+ /* Read the 3-byte size field that starts the record. */
+ rc = fsReadData(pFS, pSeg, iOff, aSz, sizeof(aSz));
+
+ if( rc==LSM_OK ){
+ int bFree;
+ if( aSz[0] & 0x80 ){
+ /* Top bit set: a size field as written by putRecordSize(). */
+ pPg->nCompress = (int)getRecordSize(aSz, &bFree);
+ }else{
+ /* Top bit clear: treated as a free-space record whose total size,
+ ** including both 3-byte size fields, is stored in the first byte. */
+ pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2;
+ bFree = 1;
+ }
+ if( bFree ){
+ if( pnSpace ){
+ *pnSpace = pPg->nCompress + sizeof(aSz)*2;
+ }else{
+ rc = LSM_CORRUPT_BKPT;
+ }
+ }else{
+ /* Step over the size field to the start of the compressed payload. */
+ rc = fsAddOffset(pFS, pSeg, iOff, 3, &iOff);
+ if( rc==LSM_OK ){
+ /* A payload larger than the input buffer cannot be valid. */
+ if( pPg->nCompress>pFS->nBuffer ){
+ rc = LSM_CORRUPT_BKPT;
+ }else{
+ rc = fsReadData(pFS, pSeg, iOff, pFS->aIBuffer, pPg->nCompress);
+ }
+ if( rc==LSM_OK ){
+ int n = pFS->nPagesize;
+ rc = p->xUncompress(p->pCtx,
+ (char *)pPg->aData, &n,
+ (const char *)pFS->aIBuffer, pPg->nCompress
+ );
+ /* The uncompressed image must be exactly one page in size. */
+ if( rc==LSM_OK && n!=pPg->pFS->nPagesize ){
+ rc = LSM_CORRUPT_BKPT;
+ }
+ }
+ }
+ }
+ }
+ return rc;
+}
+
+/*
+** Return a handle for a database page.
+**
+** The page is looked up in the page-cache hash table first. If it is not
+** there, it is either pointed at the memory mapping (when fsMmapPage()
+** is true for the page) or loaded into a buffer obtained from
+** fsPageBuffer(). If noContent is true, the buffer is not populated
+** from the file.
+**
+** If this file-system object is accessing a compressed database it may be
+** that there is no page record at database file offset iPg. Instead, there
+** may be a free space record. In this case, set *ppPg to NULL and *pnSpace
+** to the total number of free bytes before returning.
+**
+** If no error occurs, LSM_OK is returned and the reference count of the
+** returned page has been incremented. Otherwise, an lsm error code is
+** returned and *ppPg is set to NULL.
+*/
+static int fsPageGet(
+ FileSystem *pFS, /* File-system handle */
+ Segment *pSeg, /* Block redirection to use (or NULL) */
+ Pgno iPg, /* Page id */
+ int noContent, /* True to not load content from disk */
+ Page **ppPg, /* OUT: New page handle */
+ int *pnSpace /* OUT: Bytes of free space */
+){
+ Page *p;
+ int iHash;
+ int rc = LSM_OK;
+
+ /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is
+ ** not NULL, and the block containing iPg has been redirected, then iReal
+ ** is the page number after redirection. */
+ Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);
+
+ assert_lists_are_ok(pFS);
+ assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
+ assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
+ *ppPg = 0;
+
+ /* Search the hash-table for the page */
+ p = fsPageFindInHash(pFS, iReal, &iHash);
+
+ if( p ){
+ /* Cache hit. An unreferenced page sits on the LRU list; take it off
+ ** because it is about to gain a reference. */
+ assert( p->flags & PAGE_FREE );
+ if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);
+ }else{
+
+ if( fsMmapPage(pFS, iReal) ){
+ /* Memory-mapped page: make sure the mapping covers the page, then
+ ** point a Page header directly at the mapped bytes. */
+ i64 iEnd = (i64)iReal * pFS->nPagesize;
+ fsGrowMapping(pFS, iEnd, &rc);
+ if( rc!=LSM_OK ) return rc;
+
+ /* Reuse a Page header from the pFree list if one is available. */
+ if( pFS->pFree ){
+ p = pFS->pFree;
+ pFS->pFree = p->pFreeNext;
+ assert( p->nRef==0 );
+ }else{
+ p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
+ if( rc ) return rc;
+ p->pFS = pFS;
+ }
+ p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
+ p->iPg = iReal;
+
+ /* This page now carries a pointer to the mapping. Link it in to
+ ** the FileSystem.pMapped list. */
+ assert( p->pMappedNext==0 );
+ p->pMappedNext = pFS->pMapped;
+ pFS->pMapped = p;
+
+ assert( pFS->pCompress==0 );
+ assert( (p->flags & PAGE_FREE)==0 );
+ }else{
+ /* Buffered page: obtain a (possibly recycled) buffer and fill it. */
+ rc = fsPageBuffer(pFS, &p);
+ if( rc==LSM_OK ){
+ int nSpace = 0;
+ p->iPg = iReal;
+ p->nRef = 0;
+ p->pFS = pFS;
+ assert( p->flags==0 || p->flags==PAGE_FREE );
+
+#ifdef LSM_DEBUG
+ memset(p->aData, 0x56, pFS->nPagesize);
+#endif
+ assert( p->pLruNext==0 && p->pLruPrev==0 );
+ if( noContent==0 ){
+ if( pFS->pCompress ){
+ rc = fsReadPagedata(pFS, pSeg, p, &nSpace);
+ }else{
+ int nByte = pFS->nPagesize;
+ i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
+ rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
+ }
+ pFS->nRead++;
+ }
+
+ /* If the xRead() call was successful (or not attempted), link the
+ ** page into the page-cache hash-table. Otherwise, if it failed,
+ ** free the buffer. The buffer is also freed if a free-space record
+ ** was found where the page was expected (nSpace!=0). */
+ if( rc==LSM_OK && nSpace==0 ){
+ p->pHashNext = pFS->apHash[iHash];
+ pFS->apHash[iHash] = p;
+ }else{
+ fsPageBufferFree(p);
+ p = 0;
+ if( pnSpace ) *pnSpace = nSpace;
+ }
+ }
+ }
+
+ assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
+ || (rc!=LSM_OK && p==0)
+ );
+ }
+
+ if( rc==LSM_OK && p ){
+ /* In uncompressed mode the first and last page of each block give up
+ ** 4 bytes of their payload to a block pointer. */
+ if( pFS->pCompress==0 && (fsIsLast(pFS, iReal) || fsIsFirst(pFS, iReal)) ){
+ p->nData = pFS->nPagesize - 4;
+ /* Only advance aData on the first reference; a page that is already
+ ** referenced has had this adjustment applied. */
+ if( fsIsFirst(pFS, iReal) && p->nRef==0 ){
+ p->aData += 4;
+ p->flags |= PAGE_HASPREV;
+ }
+ }else{
+ p->nData = pFS->nPagesize;
+ }
+ /* nOut is incremented only when the page gains its first reference. */
+ pFS->nOut += (p->nRef==0);
+ p->nRef++;
+ }
+ *ppPg = p;
+ return rc;
+}
+
+/*
+** Read the 64-bit checkpoint id of the checkpoint currently stored on meta
+** page iMeta (1 or 2) of the database file. If no error occurs, store the
+** id value in *piVal and return LSM_OK. Otherwise, return an LSM error
+** code and leave *piVal unmodified.
+**
+** When memory mapping is enabled (nMapLimit>0) the id is read straight
+** from the mapping; otherwise it is read via a temporary meta-page handle.
+**
+** If a checkpointer connection is currently updating meta-page iMeta, or an
+** earlier checkpointer crashed while doing so, the value read into *piVal
+** may be garbage. It is the callers responsibility to deal with this.
+*/
+int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
+  FileSystem *pFS = db->pFS;
+  int rc = LSM_OK;
+
+  assert( iMeta==1 || iMeta==2 );
+  if( pFS->nMapLimit>0 ){
+    u8 *aMap;
+    fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
+    if( rc!=LSM_OK ) return rc;
+    aMap = (u8 *)pFS->pMap;
+    *piVal = (i64)lsmGetU64(&aMap[(iMeta-1)*LSM_META_PAGE_SIZE]);
+  }else{
+    MetaPage *pMeta = 0;
+    rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
+    if( rc!=LSM_OK ) return rc;
+    *piVal = (i64)lsmGetU64(pMeta->aData);
+    lsmFsMetaPageRelease(pMeta);
+  }
+
+  return rc;
+}
+
+
+/*
+** Return true (1) if pRun is not equal to pIgnore and either its first
+** page or its last page lies in the range iFirst..iLast, inclusive.
+** Return false (0) otherwise.
+*/
+static int fsRunEndsBetween(
+  Segment *pRun,
+  Segment *pIgnore,
+  Pgno iFirst,
+  Pgno iLast
+){
+  if( pRun==pIgnore ) return 0;
+  if( pRun->iFirst>=iFirst && pRun->iFirst<=iLast ) return 1;
+  if( pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast ) return 1;
+  return 0;
+}
+
+/*
+** Return true if level pLevel contains a segment other than pIgnore for
+** which the first or last page is between iFirst and iLast, inclusive.
+**
+** Both the left-hand-side segment (pLevel->lhs) and each of the
+** pLevel->nRight right-hand-side segments (pLevel->aRhs[]) are checked.
+*/
+static int fsLevelEndsBetween(
+  Level *pLevel,
+  Segment *pIgnore,
+  Pgno iFirst,
+  Pgno iLast
+){
+  int i;
+
+  if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){
+    return 1;
+  }
+  for(i=0; i<pLevel->nRight; i++){
+    if( fsRunEndsBetween(&pLevel->aRhs[i], pIgnore, iFirst, iLast) ){
+      return 1;
+    }
+  }
+
+  return 0;
+}
+
+/*
+** Block iBlk is no longer in use by segment pIgnore. If it is not in use
+** by any other segment, move it to the free block list.
+**
+** "In use" is decided by checking whether any segment of the snapshot,
+** other than pIgnore, starts or ends on block iBlk. If one does, this
+** function returns LSM_OK without modifying anything. Otherwise, every
+** append-point that lies on the block is dropped from the snapshot's
+** aiAppend[] array (the remaining entries are packed to the front and
+** the tail is zeroed) and the block is handed to lsmBlockFree().
+*/
+static int fsFreeBlock(
+  FileSystem *pFS,                /* File system object */
+  Snapshot *pSnapshot,            /* Worker snapshot */
+  Segment *pIgnore,               /* Ignore this run when searching */
+  int iBlk                        /* Block number of block to free */
+){
+  int rc = LSM_OK;                /* Return code */
+  Pgno iFirst;                    /* First page on block iBlk */
+  Pgno iLast;                     /* Last page on block iBlk */
+  Level *pLevel;                  /* Used to iterate through levels */
+
+  int iIn;                        /* Used to iterate through append points */
+  int iOut = 0;                   /* Used to output append points */
+  Pgno *aApp = pSnapshot->aiAppend;
+
+  iFirst = fsFirstPageOnBlock(pFS, iBlk);
+  iLast = fsLastPageOnBlock(pFS, iBlk);
+
+  /* Check if any other run in the snapshot has a start or end page
+  ** within this block. If there is such a run, return early. */
+  for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){
+    if( fsLevelEndsBetween(pLevel, pIgnore, iFirst, iLast) ){
+      return LSM_OK;
+    }
+  }
+
+  /* Remove any entries that lie on this block from the append-list. */
+  for(iIn=0; iIn<LSM_APPLIST_SZ; iIn++){
+    if( aApp[iIn]<iFirst || aApp[iIn]>iLast ){
+      aApp[iOut++] = aApp[iIn];
+    }
+  }
+  while( iOut<LSM_APPLIST_SZ ) aApp[iOut++] = 0;
+
+  if( rc==LSM_OK ){
+    rc = lsmBlockFree(pFS->pDb, iBlk);
+  }
+  return rc;
+}
+
+/*
+** Delete or otherwise recycle the blocks currently occupied by run pDel.
+**
+** Starting from the block that holds pDel->iFirst, each block of the run
+** is passed to fsFreeBlock(), following the "next block" pointers until
+** the block that holds pDel->iLastPg has been handled. If bZero is true
+** the Segment structure is zeroed before returning. A segment with
+** iFirst==0 is left untouched.
+**
+** This function always returns LSM_OK.
+*/
+int lsmFsSortedDelete(
+ FileSystem *pFS,
+ Snapshot *pSnapshot,
+ int bZero, /* True to zero the Segment structure */
+ Segment *pDel
+){
+ if( pDel->iFirst ){
+ int rc = LSM_OK;
+
+ int iBlk;
+ int iLastBlk;
+
+ iBlk = fsPageToBlock(pFS, pDel->iFirst);
+ iLastBlk = fsPageToBlock(pFS, pDel->iLastPg);
+
+ /* Mark all blocks currently used by this sorted run as free */
+ while( iBlk && rc==LSM_OK ){
+ int iNext = 0;
+ if( iBlk!=iLastBlk ){
+ rc = fsBlockNext(pFS, pDel, iBlk, &iNext);
+ }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){
+ /* The segment is being kept (bZero==0) and its last block is only
+ ** partly used, so that block is not freed. */
+ break;
+ }
+ /* NOTE(review): an error code stored in rc by fsBlockNext() above is
+ ** overwritten by the assignment below, and the function returns
+ ** LSM_OK regardless of rc - confirm this is intentional. */
+ rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk);
+ iBlk = iNext;
+ }
+
+ /* The redirect table belongs to the snapshot; it is emptied once the
+ ** segment that used it has been deleted. */
+ if( pDel->pRedirect ){
+ assert( pDel->pRedirect==&pSnapshot->redirect );
+ pSnapshot->redirect.n = 0;
+ }
+
+ if( bZero ) memset(pDel, 0, sizeof(Segment));
+ }
+ return LSM_OK;
+}
+
+/*
+** aPgno is an array containing nPgno page numbers. Return the smallest page
+** number from the array that falls on block iBlk. Or, if none of the pages
+** in aPgno[] fall on block iBlk, return 0.
+*/
+static Pgno firstOnBlock(FileSystem *pFS, int iBlk, Pgno *aPgno, int nPgno){
+  Pgno iRet = 0;
+  int i;
+  for(i=0; i<nPgno; i++){
+    Pgno iPg = aPgno[i];
+    if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){
+      iRet = iPg;
+    }
+  }
+  return iRet;
+}
+
+#ifndef NDEBUG
+/*
+** Return true if page iPg, which is a part of segment p, lies on
+** a redirected block. A page number of 0 never redirects.
+*/
+static int fsPageRedirects(FileSystem *pFS, Segment *p, Pgno iPg){
+  return (iPg!=0 && iPg!=lsmFsRedirectPage(pFS, p->pRedirect, iPg));
+}
+
+/*
+** Return true if the second argument is not NULL and any of the first,
+** last or root pages lie on a redirected block.
+*/
+static int fsSegmentRedirects(FileSystem *pFS, Segment *p){
+  return (p && (
+      fsPageRedirects(pFS, p, p->iFirst)
+   || fsPageRedirects(pFS, p, p->iRoot)
+   || fsPageRedirects(pFS, p, p->iLastPg)
+  ));
+}
+#endif
+
+/*
+** Argument aPgno is an array of nPgno page numbers. All pages belong to
+** the segment pRun. This function gobbles from the start of the run to the
+** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is
+** the new first page of the run).
+**
+** Each block that lies wholly before the new first page is passed to
+** fsFreeBlock(), and pRun->nSize and pRun->iFirst are updated to match.
+** The worker snapshot (pDb->pWorker) is the one that is updated.
+*/
+void lsmFsGobble(
+ lsm_db *pDb,
+ Segment *pRun,
+ Pgno *aPgno,
+ int nPgno
+){
+ int rc = LSM_OK;
+ FileSystem *pFS = pDb->pFS;
+ Snapshot *pSnapshot = pDb->pWorker;
+ int iBlk;
+
+ assert( pRun->nSize>0 );
+ assert( 0==fsSegmentRedirects(pFS, pRun) );
+ assert( nPgno>0 && 0==fsPageRedirects(pFS, pRun, aPgno[0]) );
+
+ /* Temporarily count the run as if it began at the first page of its
+ ** first block, so that whole blocks can be subtracted in the loop. */
+ iBlk = fsPageToBlock(pFS, pRun->iFirst);
+ pRun->nSize += (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
+
+ while( rc==LSM_OK ){
+ int iNext = 0;
+ Pgno iFirst = firstOnBlock(pFS, iBlk, aPgno, nPgno);
+ if( iFirst ){
+ /* This block holds one of the aPgno[] pages: the run now starts here. */
+ pRun->iFirst = iFirst;
+ break;
+ }
+ rc = fsBlockNext(pFS, pRun, iBlk, &iNext);
+ if( rc==LSM_OK ) rc = fsFreeBlock(pFS, pSnapshot, pRun, iBlk);
+ pRun->nSize -= (int)(
+ 1 + fsLastPageOnBlock(pFS, iBlk) - fsFirstPageOnBlock(pFS, iBlk)
+ );
+ iBlk = iNext;
+ }
+
+ /* Undo the temporary adjustment for the pages that precede the new first
+ ** page on its block. */
+ pRun->nSize -= (int)(pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk));
+ assert( pRun->nSize>0 );
+}
+
+/*
+** This function is only used in compressed database mode.
+**
+** Argument iPg is the page number (byte offset) of a page within segment
+** pSeg. The page record, including all headers, is nByte bytes in size.
+** Before returning, set *piNext to the page number of the next page in
+** the segment, or to zero if iPg is the last.
+**
+** In other words, do:
+**
+**     *piNext = iPg + nByte;
+**
+** But take block overflow and redirection into account. The offset of
+** the final byte of the record is computed first; if pSeg is not NULL and
+** that byte is pSeg->iLastPg, the record is the last one in the segment.
+*/
+static int fsNextPageOffset(
+  FileSystem *pFS,                /* File system object */
+  Segment *pSeg,                  /* Segment to move within */
+  Pgno iPg,                       /* Offset of current page */
+  int nByte,                      /* Size of current page including headers */
+  Pgno *piNext                    /* OUT: Offset of next page. Or zero (EOF) */
+){
+  Pgno iEnd;                      /* Offset of last byte of current record */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  rc = fsAddOffset(pFS, pSeg, iPg, nByte-1, &iEnd);
+  if( pSeg!=0 && pSeg->iLastPg==iEnd ){
+    *piNext = 0;
+    return rc;
+  }
+  if( rc==LSM_OK ){
+    rc = fsAddOffset(pFS, pSeg, iEnd, 1, &iEnd);
+  }
+
+  *piNext = iEnd;
+  return rc;
+}
+
+/*
+** This function is only used in compressed database mode.
+**
+** Argument iPg is the page number of a page that appears in segment pSeg.
+** This function determines the page number of the previous page in the
+** same run by decoding the 3-byte size field stored immediately before
+** iPg. *piPrev is set to the previous page number before returning.
+**
+** LSM_OK is returned if no error occurs. Otherwise, an lsm error code.
+** If any value other than LSM_OK is returned, then the final value of
+** *piPrev is undefined.
+*/
+static int fsGetPageBefore(
+  FileSystem *pFS,
+  Segment *pSeg,
+  Pgno iPg,
+  Pgno *piPrev
+){
+  u8 aSz[3];                      /* Trailing size field of previous record */
+  i64 iRead;                      /* Offset of that size field */
+  int bFree;                      /* Output of getRecordSize(); not used */
+  int nSz;                        /* Total size of the previous record */
+  int rc;
+
+  assert( pFS->pCompress );
+
+  rc = fsSubtractOffset(pFS, pSeg, iPg, sizeof(aSz), &iRead);
+  if( rc!=LSM_OK ) return rc;
+
+  rc = fsReadData(pFS, pSeg, iRead, aSz, sizeof(aSz));
+  if( rc!=LSM_OK ) return rc;
+
+  if( aSz[2] & 0x80 ){
+    /* A size field as written by putRecordSize(); add both size fields. */
+    nSz = getRecordSize(aSz, &bFree) + sizeof(aSz)*2;
+  }else{
+    /* The final byte alone holds the total size of the record. */
+    nSz = (int)(aSz[2] & 0x7F);
+    bFree = 1;
+  }
+  return fsSubtractOffset(pFS, pSeg, iPg, nSz, piPrev);
+}
+
+/*
+** The first argument to this function is a valid reference to a database
+** file page that is part of a sorted run. If parameter eDir is -1, this
+** function attempts to locate and load the previous page in the same run.
+** Or, if eDir is +1, it attempts to find the next page in the same run.
+** The results of passing an eDir value other than positive or negative one
+** are undefined.
+**
+** If parameter pRun is not NULL then it must point to the run that page
+** pPg belongs to. In this case, if pPg is the first or last page of the
+** run, and the request is for the previous or next page, respectively,
+** *ppNext is set to NULL before returning LSM_OK. If pRun is NULL, then it
+** is assumed that the next or previous page, as requested, exists.
+**
+** If the previous/next page does exist and is successfully loaded, *ppNext
+** is set to point to it and LSM_OK is returned. Otherwise, if an error
+** occurs, *ppNext is set to NULL and an lsm error code returned.
+**
+** Page references returned by this function should be released by the
+** caller using lsmFsPageRelease().
+*/
+int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){
+  int rc = LSM_OK;
+  FileSystem *pFS = pPg->pFS;
+  Pgno iPg = pPg->iPg;
+
+  assert( 0==fsSegmentRedirects(pFS, pRun) );
+  if( pFS->pCompress ){
+    /* Size of the current record: payload plus two 3-byte size fields. */
+    int nSpace = pPg->nCompress + 2*3;
+
+    /* Step from record to record, skipping over free-space records (for
+    ** which fsPageGet() reports nSpace>0 and returns no page). */
+    do {
+      if( eDir>0 ){
+        rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg);
+      }else{
+        /* pRun may be NULL (see above), so guard the dereference in the
+        ** same way as the uncompressed branch below does. */
+        if( pRun && iPg==pRun->iFirst ){
+          iPg = 0;
+        }else{
+          rc = fsGetPageBefore(pFS, pRun, iPg, &iPg);
+        }
+      }
+
+      nSpace = 0;
+      /* Do not attempt to load a page if calculating its offset failed:
+      ** iPg is not meaningful in that case and the error code must not be
+      ** overwritten by the result of fsPageGet(). */
+      if( rc==LSM_OK && iPg!=0 ){
+        rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, &nSpace);
+        assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) );
+      }else{
+        *ppNext = 0;
+      }
+    }while( nSpace>0 && rc==LSM_OK );
+
+  }else{
+    Redirect *pRedir = pRun ? pRun->pRedirect : 0;
+    assert( eDir==1 || eDir==-1 );
+    if( eDir<0 ){
+      if( pRun && iPg==pRun->iFirst ){
+        *ppNext = 0;
+        return LSM_OK;
+      }else if( fsIsFirst(pFS, iPg) ){
+        /* The first page of a block stores the previous block number in
+        ** the 4 bytes that precede its data area. */
+        assert( pPg->flags & PAGE_HASPREV );
+        iPg = fsLastPageOnBlock(pFS, lsmGetU32(&pPg->aData[-4]));
+      }else{
+        iPg--;
+      }
+    }else{
+      if( pRun ){
+        if( iPg==pRun->iLastPg ){
+          *ppNext = 0;
+          return LSM_OK;
+        }
+      }
+
+      if( fsIsLast(pFS, iPg) ){
+        /* The last page of a block stores the next block number in its
+        ** final 4 bytes. */
+        int iBlk = fsRedirectBlock(
+            pRedir, lsmGetU32(&pPg->aData[pFS->nPagesize-4])
+        );
+        iPg = fsFirstPageOnBlock(pFS, iBlk);
+      }else{
+        iPg++;
+      }
+    }
+    rc = fsPageGet(pFS, pRun, iPg, 0, ppNext, 0);
+  }
+
+  return rc;
+}
+
+/*
+** This function is called when creating a new segment to determine if the
+** first part of it can be written following an existing segment on an
+** already allocated block. If it is possible, the page number of the first
+** page to use for the new segment is returned. Otherwise zero.
+**
+** If argument pLvl is not NULL, then this function will not attempt to
+** start the new segment immediately following any segment that is part
+** of the right-hand-side of pLvl.
+**
+** The append-point that is returned is removed (zeroed) from the worker
+** snapshot's aiAppend[] array. The array is searched from its last entry
+** towards its first.
+*/
+static Pgno findAppendPoint(FileSystem *pFS, Level *pLvl){
+  int i;
+  Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
+  Pgno iRet = 0;
+
+  for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){
+    if( (iRet = aiAppend[i]) ){
+      if( pLvl ){
+        /* Reject the candidate if a right-hand-side segment of pLvl ends
+        ** on the same block. */
+        int iBlk = fsPageToBlock(pFS, iRet);
+        int j;
+        for(j=0; iRet && j<pLvl->nRight; j++){
+          if( fsPageToBlock(pFS, pLvl->aRhs[j].iLastPg)==iBlk ){
+            iRet = 0;
+          }
+        }
+      }
+      if( iRet ) aiAppend[i] = 0;
+    }
+  }
+  return iRet;
+}
+
+/*
+** Append a page to the left-hand-side of pLvl. Set the ref-count to 1 and
+** return a pointer to it. The page is writable until either
+** lsmFsPagePersist() is called on it or the ref-count drops to zero.
+**
+** In compressed mode, or if bDefer is true, the page is a bare buffer
+** with no page number (iPg==0) and the segment is not modified here.
+** Otherwise a page number is chosen, the segment's nSize/iFirst/iLastPg
+** fields are updated and, where needed, a new block is allocated.
+**
+** *ppOut is set to the new page, or to NULL if an error occurs. LSM_OK
+** or an lsm error code is returned.
+*/
+int lsmFsSortedAppend(
+ FileSystem *pFS,
+ Snapshot *pSnapshot,
+ Level *pLvl,
+ int bDefer,
+ Page **ppOut
+){
+ int rc = LSM_OK;
+ Page *pPg = 0;
+ Pgno iApp = 0;
+ Pgno iNext = 0;
+ Segment *p = &pLvl->lhs;
+ Pgno iPrev = p->iLastPg;
+
+ *ppOut = 0;
+ assert( p->pRedirect==0 );
+
+ if( pFS->pCompress || bDefer ){
+ /* In compressed database mode the page is not assigned a page number
+ ** or location in the database file at this point. This will be done
+ ** by the lsmFsPagePersist() call. */
+ rc = fsPageBuffer(pFS, &pPg);
+ if( rc==LSM_OK ){
+ pPg->pFS = pFS;
+ pPg->pSeg = p;
+ pPg->iPg = 0;
+ pPg->flags |= PAGE_DIRTY;
+ pPg->nData = pFS->nPagesize;
+ assert( pPg->aData );
+ /* Uncompressed (deferred) pages give up 4 bytes of payload. */
+ if( pFS->pCompress==0 ) pPg->nData -= 4;
+
+ pPg->nRef = 1;
+ pFS->nOut++;
+ }
+ }else{
+ /* Work out the page number to append: an existing append-point for an
+ ** empty segment, the first page of the next block if the previous page
+ ** ended its block, or simply the page after the previous one. */
+ if( iPrev==0 ){
+ iApp = findAppendPoint(pFS, pLvl);
+ }else if( fsIsLast(pFS, iPrev) ){
+ int iNext2;
+ rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iPrev), &iNext2);
+ if( rc!=LSM_OK ) return rc;
+ iApp = fsFirstPageOnBlock(pFS, iNext2);
+ }else{
+ iApp = iPrev + 1;
+ }
+
+ /* If this is the first page allocated, or if the page allocated is the
+ ** last in the block, also allocate the next block here. */
+ if( iApp==0 || fsIsLast(pFS, iApp) ){
+ int iNew; /* New block number */
+
+ rc = lsmBlockAllocate(pFS->pDb, 0, &iNew);
+ if( rc!=LSM_OK ) return rc;
+ if( iApp==0 ){
+ iApp = fsFirstPageOnBlock(pFS, iNew);
+ }else{
+ iNext = fsFirstPageOnBlock(pFS, iNew);
+ }
+ }
+
+ /* Grab the new page. The noContent flag is set, so nothing is read
+ ** from the file. */
+ pPg = 0;
+ rc = fsPageGet(pFS, 0, iApp, 1, &pPg, 0);
+ assert( rc==LSM_OK || pPg==0 );
+
+ /* If this is the first or last page of a block, fill in the pointer
+ ** value at the end of the new page. */
+ if( rc==LSM_OK ){
+ p->nSize++;
+ p->iLastPg = iApp;
+ if( p->iFirst==0 ) p->iFirst = iApp;
+ pPg->flags |= PAGE_DIRTY;
+
+ if( fsIsLast(pFS, iApp) ){
+ /* Last page of a block: store the number of the next block. */
+ lsmPutU32(&pPg->aData[pFS->nPagesize-4], fsPageToBlock(pFS, iNext));
+ }else if( fsIsFirst(pFS, iApp) ){
+ /* First page of a block: store the number of the previous block. */
+ lsmPutU32(&pPg->aData[-4], fsPageToBlock(pFS, iPrev));
+ }
+ }
+ }
+
+ *ppOut = pPg;
+ return rc;
+}
+
+/*
+** Mark the segment passed as the second argument as finished. Once a segment
+** is marked as finished it is not possible to append any further pages to
+** it.
+**
+** A NULL segment, or one with iLastPg==0, is a no-op.
+**
+** Return LSM_OK if successful or an lsm error code if an error occurs.
+*/
+int lsmFsSortedFinish(FileSystem *pFS, Segment *p){
+  int rc = LSM_OK;
+  if( p && p->iLastPg ){
+    assert( p->pRedirect==0 );
+
+    /* Check if the last page of this run happens to be the last of a block.
+    ** If it is, then an extra block has already been allocated for this run.
+    ** Shift this extra block back to the free-block list.
+    **
+    ** Otherwise, add the first free page in the last block used by the run
+    ** to the lAppend list.
+    */
+    if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){
+      int i;
+      Pgno *aiAppend = pFS->pDb->pWorker->aiAppend;
+      /* Store the append-point in the first unused slot. If every slot is
+      ** already in use, the append-point is not recorded. */
+      for(i=0; i<LSM_APPLIST_SZ; i++){
+        if( aiAppend[i]==0 ){
+          aiAppend[i] = p->iLastPg+1;
+          break;
+        }
+      }
+    }else if( pFS->pCompress==0 ){
+      /* Uncompressed: the pre-allocated block number is stored in the
+      ** final 4 bytes of the run's last page. */
+      Page *pLast;
+      rc = fsPageGet(pFS, 0, p->iLastPg, 0, &pLast, 0);
+      if( rc==LSM_OK ){
+        int iBlk = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]);
+        lsmBlockRefree(pFS->pDb, iBlk);
+        lsmFsPageRelease(pLast);
+      }
+    }else{
+      /* Compressed: read the block's "next block" pointer instead. */
+      int iBlk = 0;
+      rc = fsBlockNext(pFS, p, fsPageToBlock(pFS, p->iLastPg), &iBlk);
+      if( rc==LSM_OK ){
+        lsmBlockRefree(pFS->pDb, iBlk);
+      }
+    }
+  }
+  return rc;
+}
+
+/*
+** Obtain a reference to page number iPg, loading its content from the
+** database file if required. pSeg, if not NULL, supplies the block
+** redirections to apply.
+**
+** Return LSM_OK if successful, or an lsm error code if an error occurs.
+*/
+int lsmFsDbPageGet(FileSystem *pFS, Segment *pSeg, Pgno iPg, Page **ppPg){
+  int rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
+  return rc;
+}
+
+/*
+** Obtain a reference to the last page in the segment passed as the
+** second argument.
+**
+** In compressed mode the search starts one past pSeg->iLastPg and steps
+** backwards one record at a time, skipping free-space records, until a
+** page record is loaded.
+**
+** Return LSM_OK if successful, or an lsm error code if an error occurs.
+*/
+int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){
+  int rc;
+  Pgno iPg = pSeg->iLastPg;
+
+  if( pFS->pCompress==0 ){
+    return fsPageGet(pFS, pSeg, iPg, 0, ppPg, 0);
+  }
+
+  iPg++;
+  for(;;){
+    int nSpace = 0;
+    rc = fsGetPageBefore(pFS, pSeg, iPg, &iPg);
+    if( rc!=LSM_OK ) break;
+    rc = fsPageGet(pFS, pSeg, iPg, 0, ppPg, &nSpace);
+    if( rc!=LSM_OK || nSpace<=0 ) break;
+  }
+  return rc;
+}
+
+/*
+** Return a reference to meta-page iPg. If successful, LSM_OK is returned
+** and *ppPg populated with the new page reference. The reference should
+** be released by the caller using lsmFsMetaPageRelease().
+**
+** If the database is memory-mapped (pFS->nMapLimit>0) the page data points
+** directly into the mapping. Otherwise a heap buffer of pFS->nMetasize
+** bytes is allocated and, for read access (bWrite==0), populated by
+** reading pFS->nMetaRwSize bytes from the database file.
+**
+** Otherwise, if an error occurs, *ppPg is set to NULL and an LSM error
+** code is returned.
+*/
+int lsmFsMetaPageGet(
+  FileSystem *pFS,                /* File-system connection */
+  int bWrite,                     /* True for write access, false for read */
+  int iPg,                        /* Either 1 or 2 */
+  MetaPage **ppPg                 /* OUT: Pointer to MetaPage object */
+){
+  int rc = LSM_OK;
+  MetaPage *pPg;
+  assert( iPg==1 || iPg==2 );
+
+  /* Size the allocation from the object actually being allocated. */
+  pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(*pPg), &rc);
+
+  if( pPg ){
+    i64 iOff = (iPg-1) * pFS->nMetasize;
+    if( pFS->nMapLimit>0 ){
+      fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
+      /* Only derive a pointer from the mapping if it was set up successfully.
+      ** On failure pPg->aData stays NULL and pPg is freed below. */
+      if( rc==LSM_OK ){
+        pPg->aData = (u8 *)(pFS->pMap) + iOff;
+      }
+    }else{
+      pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
+      if( rc==LSM_OK && bWrite==0 ){
+        rc = lsmEnvRead(
+            pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetaRwSize
+        );
+      }
+#ifndef NDEBUG
+      /* A page opened for writing is not read from disk, so its buffer is
+      ** uninitialized and may later be handed to a downstream write().
+      ** For performance reasons the buffer is deliberately not cleared in
+      ** release builds. However, the resulting "false positive" reported
+      ** by memory checkers makes real misuse harder to track down, so the
+      ** buffer is filled with a recognizable pattern when NDEBUG is not
+      ** defined (i.e. in debug builds only).
+      */
+      else if( rc==LSM_OK ){
+        memset( pPg->aData, 0x77, pFS->nMetasize );
+      }
+#endif
+    }
+
+    if( rc!=LSM_OK ){
+      /* The data buffer is owned by pPg only when not memory-mapped. */
+      if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
+      lsmFree(pFS->pEnv, pPg);
+      pPg = 0;
+    }else{
+      pPg->iPg = iPg;
+      pPg->bWrite = bWrite;
+      pPg->pFS = pFS;
+    }
+  }
+
+  *ppPg = pPg;
+  return rc;
+}
+
+/*
+** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
+** Passing NULL is a harmless no-op that returns LSM_OK.
+**
+** When the database is not memory-mapped (pFS->nMapLimit==0) and the page
+** was opened for writing, pFS->nMetaRwSize bytes of the page buffer are
+** written back to the database file before the buffer is freed. The value
+** returned is the result of that write, or LSM_OK if no write was needed.
+*/
+int lsmFsMetaPageRelease(MetaPage *pPg){
+  int rc = LSM_OK;
+  FileSystem *pFS;
+
+  if( pPg==0 ) return rc;
+  pFS = pPg->pFS;
+
+  if( pFS->nMapLimit==0 ){
+    if( pPg->bWrite ){
+      int nByte = pFS->nMetaRwSize;
+      i64 iOff = 0;
+      /* Meta-page 2 starts nMetasize bytes into the file; page 1 at 0. */
+      if( pPg->iPg==2 ) iOff = pFS->nMetasize;
+      rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nByte);
+    }
+    lsmFree(pFS->pEnv, pPg->aData);
+  }
+  lsmFree(pFS->pEnv, pPg);
+
+  return rc;
+}
+
+/*
+** Return a pointer to the data buffer of the meta-page passed as the first
+** argument. If pnData is not NULL, *pnData is set to pFS->nMetaRwSize (the
+** number of bytes read from or written to disk for a meta-page) before
+** returning.
+*/
+u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){
+  u8 *aRet;
+  if( pnData!=0 ){
+    *pnData = pPg->pFS->nMetaRwSize;
+  }
+  aRet = pPg->aData;
+  return aRet;
+}
+
+/*
+** Return 1 if the PAGE_DIRTY flag is set on the page, or 0 otherwise.
+** Only compiled when NDEBUG is not defined; intended for use within
+** assert() statements.
+*/
+#ifndef NDEBUG
+int lsmFsPageWritable(Page *pPg){
+  return (pPg->flags & PAGE_DIRTY)!=0;
+}
+#endif
+
+/*
+** Block iFrom is being redirected to block iTo. If page number (*piPg)
+** lies on block iFrom, replace it with the equivalent page number on
+** block iTo. Otherwise leave (*piPg) untouched.
+**
+** The per-block stride used for the translation is pFS->nBlocksize in
+** compressed mode and (nBlocksize / nPagesize) otherwise.
+*/
+static void fsMovePage(
+  FileSystem *pFS,                /* File system object */
+  int iTo,                        /* Destination block */
+  int iFrom,                      /* Source block */
+  Pgno *piPg                      /* IN/OUT: Page number */
+){
+  Pgno iOld = *piPg;
+  int nPerBlk;
+
+  if( iFrom!=fsPageToBlock(pFS, iOld) ) return;
+
+  if( pFS->pCompress ){
+    nPerBlk = pFS->nBlocksize;
+  }else{
+    nPerBlk = pFS->nBlocksize / pFS->nPagesize;
+  }
+  *piPg = iOld - (Pgno)(iFrom - iTo) * nPerBlk;
+}
+
+/*
+** Copy the contents of block iFrom to block iTo, then update the worker
+** snapshot's append-point list and the fields of segment pSeg so that any
+** page number on block iFrom refers to the matching page on block iTo.
+**
+** It is safe to assume that there are no outstanding references to pages
+** on block iTo. And that block iFrom is not currently being written. In
+** other words, the data can be read and written directly.
+**
+** Return LSM_OK if successful, or an lsm error code otherwise. Note that
+** the append-point list and pSeg are updated even if the copy failed.
+*/
+int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
+  Snapshot *p = pFS->pDb->pWorker;
+  int rc = LSM_OK;
+  int i;
+  i64 nMap;
+
+  i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
+  i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
+
+  assert( iTo!=1 );
+  assert( iFrom>iTo );
+
+  /* Grow the mapping as required. */
+  nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
+  fsGrowMapping(pFS, nMap, &rc);
+
+  if( rc==LSM_OK ){
+    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
+    int nSz = pFS->nPagesize;
+    u8 *aBuf = 0;                 /* Lazily allocated bounce buffer */
+    u8 *aData = 0;                /* Points at the current source page */
+
+    for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
+      i64 iOff = iFromOff + i*nSz;
+
+      /* Set aData to point to a buffer containing the from page. Pages
+      ** that lie entirely inside the mapped region are used in place;
+      ** others are read from the file into aBuf. */
+      if( (iOff+nSz)<=pFS->nMapLimit ){
+        u8 *aMap = (u8 *)(pFS->pMap);
+        aData = &aMap[iOff];
+      }else{
+        if( aBuf==0 ){
+          aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
+          if( aBuf==0 ) break;
+        }
+        aData = aBuf;
+        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
+      }
+
+      /* Copy aData to the to page */
+      if( rc==LSM_OK ){
+        iOff = iToOff + i*nSz;
+        if( (iOff+nSz)<=pFS->nMapLimit ){
+          u8 *aMap = (u8 *)(pFS->pMap);
+          memcpy(&aMap[iOff], aData, nSz);
+        }else{
+          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
+        }
+      }
+    }
+    lsmFree(pFS->pEnv, aBuf);
+    lsmFsPurgeCache(pFS);
+  }
+
+  /* Update append-point list if necessary */
+  for(i=0; i<LSM_APPLIST_SZ; i++){
+    fsMovePage(pFS, iTo, iFrom, &p->aiAppend[i]);
+  }
+
+  /* Update the Segment structure itself */
+  fsMovePage(pFS, iTo, iFrom, &pSeg->iFirst);
+  fsMovePage(pFS, iTo, iFrom, &pSeg->iLastPg);
+  fsMovePage(pFS, iTo, iFrom, &pSeg->iRoot);
+
+  return rc;
+}
+
+/*
+** Append raw data to a segment. Return the database file offset that the
+** data is written to (this may be used as the page number if the data
+** being appended is a new page record).
+**
+** This function is only used in compressed database mode.
+**
+** If *pRc is not LSM_OK on entry, this function is a no-op that returns 0.
+** Otherwise *pRc is set to the result of the operation before returning,
+** and pSeg->iLastPg is updated to the offset of the last byte consumed
+** (this assignment happens even if an error occurred part-way through).
+**
+** If the data does not fit in the current block, the remainder is written
+** to the start of the next block and the two blocks are linked via 4-byte
+** "next" and "prev" pointers.
+*/
+static Pgno fsAppendData(
+ FileSystem *pFS, /* File-system handle */
+ Segment *pSeg, /* Segment to append to */
+ const u8 *aData, /* Buffer containing data to write */
+ int nData, /* Size of buffer aData[] in bytes */
+ int *pRc /* IN/OUT: Error code */
+){
+ Pgno iRet = 0;
+ int rc = *pRc;
+ assert( pFS->pCompress );
+ if( rc==LSM_OK ){
+ int nRem = 0;
+ int nWrite = 0;
+ Pgno iLastOnBlock;
+ Pgno iApp = pSeg->iLastPg+1;
+
+ /* If this is the first data written into the segment, find an append-point
+ ** or allocate a new block. (iApp==1 means pSeg->iLastPg was zero.) */
+ if( iApp==1 ){
+ pSeg->iFirst = iApp = findAppendPoint(pFS, 0);
+ if( iApp==0 ){
+ int iBlk;
+ rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
+ /* NOTE(review): iBlk is used here without checking rc; this relies on
+ ** lsmBlockAllocate() always writing *piBlk - confirm. */
+ pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk);
+ }
+ }
+ iRet = iApp;
+
+ /* Write as much data as is possible at iApp (usually all of it). */
+ iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp);
+ if( rc==LSM_OK ){
+ int nSpace = (int)(iLastOnBlock - iApp + 1);
+ nWrite = LSM_MIN(nData, nSpace);
+ nRem = nData - nWrite;
+ assert( nWrite>=0 );
+ if( nWrite!=0 ){
+ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite);
+ }
+ iApp += nWrite;
+ }
+
+ /* If required, allocate a new block and write the rest of the data
+ ** into it. Set the next and previous block pointers to link the new
+ ** block to the old. */
+ assert( nRem<=0 || (iApp-1)==iLastOnBlock );
+ if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){
+ u8 aPtr[4]; /* Space to serialize a u32 */
+ int iBlk; /* New block number */
+
+ if( nWrite>0 ){
+ /* Allocate a new block. */
+ rc = lsmBlockAllocate(pFS->pDb, 0, &iBlk);
+
+ /* Set the "next" pointer on the old block */
+ if( rc==LSM_OK ){
+ assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 );
+ lsmPutU32(aPtr, iBlk);
+ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr));
+ }
+
+ /* Set the "prev" pointer on the new block */
+ if( rc==LSM_OK ){
+ Pgno iWrite;
+ lsmPutU32(aPtr, fsPageToBlock(pFS, iApp));
+ iWrite = fsFirstPageOnBlock(pFS, iBlk);
+ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr));
+ if( nRem>0 ) iApp = iWrite;
+ }
+ }else{
+ /* The next block is already allocated. Nothing was written to the
+ ** current block, so the returned offset (iRet) is moved to the start
+ ** of the next block as well. */
+ assert( nRem>0 );
+ assert( pSeg->pRedirect==0 );
+ rc = fsBlockNext(pFS, 0, fsPageToBlock(pFS, iApp), &iBlk);
+ iRet = iApp = fsFirstPageOnBlock(pFS, iBlk);
+ }
+
+ /* Write the remaining data into the new block */
+ if( rc==LSM_OK && nRem>0 ){
+ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem);
+ iApp += nRem;
+ }
+ }
+
+ /* iApp is one past the last byte written (or consumed). */
+ pSeg->iLastPg = iApp-1;
+ *pRc = rc;
+ }
+
+ return iRet;
+}
+
+/*
+** This function is only called in compressed database mode. It runs the
+** xCompress() method of pFS->pCompress over the contents of page pPg,
+** directing the output into buffer pFS->aOBuffer. On entry to xCompress(),
+** pPg->nCompress holds the buffer capacity (pFS->nBuffer); xCompress() is
+** passed a pointer to it so that it can store the compressed size there.
+**
+** fsAllocateBuffer() is called first to make sure the output buffer
+** exists. If that call returns non-zero, LSM_NOMEM is returned. Otherwise
+** the return value of xCompress() is returned.
+*/
+static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){
+  lsm_compress *pMethod = pFS->pCompress;
+  int rc;
+
+  if( fsAllocateBuffer(pFS, 1) ){
+    return LSM_NOMEM;
+  }
+  assert( pPg->nData==pFS->nPagesize );
+
+  pPg->nCompress = pFS->nBuffer;
+  rc = pMethod->xCompress(
+      pMethod->pCtx,
+      (char *)pFS->aOBuffer, &pPg->nCompress,
+      (const char *)pPg->aData, pPg->nData
+  );
+  return rc;
+}
+
+/*
+** Append a new page to segment pSeg. Set output variable *piNew to the
+** page number of the new page before returning.
+**
+** If the new page is the last on its block, then the 'next' block that
+** will be used by the segment is allocated here too. In this case output
+** variable *piNext is set to the block number of the next block.
+**
+** If the new page is the first on its block (i.e. the previous last page
+** of the segment was the last on its block), set output variable *piPrev
+** to the block number of the previous block in the segment.
+**
+** On success pSeg->nSize is incremented and pSeg->iLastPg is set to the
+** new page number.
+**
+** LSM_OK is returned if successful, or an lsm error code otherwise. If
+** any value other than LSM_OK is returned, then the final value of all
+** output variables is undefined.
+*/
+static int fsAppendPage(
+  FileSystem *pFS,
+  Segment *pSeg,
+  Pgno *piNew,
+  int *piPrev,
+  int *piNext
+){
+  const Pgno iTail = pSeg->iLastPg;
+  int rc = LSM_OK;
+  assert( iTail!=0 );
+
+  *piPrev = 0;
+  *piNext = 0;
+
+  if( !fsIsLast(pFS, iTail) ){
+    /* There is room on the current block. */
+    *piNew = iTail+1;
+    if( fsIsLast(pFS, *piNew) ){
+      /* The new page fills the block, so allocate the next block now. */
+      int iNewBlk;
+      rc = lsmBlockAllocate(pFS->pDb, 0, &iNewBlk);
+      if( rc==LSM_OK ) *piNext = iNewBlk;
+    }
+  }else{
+    /* Use the first page on the next block, which has already been
+    ** allocated. Setting *piPrev tells the caller to store the "previous
+    ** block" pointer in the first 4 bytes of the page. */
+    int iFollow;
+    int iCurBlk = fsPageToBlock(pFS, iTail);
+    assert( pSeg->pRedirect==0 );
+    rc = fsBlockNext(pFS, 0, iCurBlk, &iFollow);
+    if( rc==LSM_OK ){
+      *piNew = fsFirstPageOnBlock(pFS, iFollow);
+      *piPrev = iCurBlk;
+    }
+  }
+  if( rc!=LSM_OK ) return rc;
+
+  pSeg->nSize++;
+  pSeg->iLastPg = *piNew;
+  return LSM_OK;
+}
+
+/*
+** Detach the FileSystem.pWaiting list and process every page on it: the
+** page is persisted with lsmFsPagePersist() (only while no error has
+** occurred so far) and then released. Pages are always released, even
+** after an error.
+**
+** *pRc is both the initial error state and the place the final error
+** code is stored.
+*/
+void lsmFsFlushWaiting(FileSystem *pFS, int *pRc){
+  int rc = *pRc;
+  Page *pCur = pFS->pWaiting;
+  Page *pFollow;
+
+  pFS->pWaiting = 0;
+  for( ; pCur; pCur=pFollow ){
+    /* Read the link first, before the page is persisted and released. */
+    pFollow = pCur->pWaitingNext;
+    if( rc==LSM_OK ){
+      rc = lsmFsPagePersist(pCur);
+    }
+    assert( pCur->nRef==1 );
+    lsmFsPageRelease(pCur);
+  }
+
+  *pRc = rc;
+}
+
+/*
+** If the page hash table contains an entry for page number iPg, detach it
+** from that key: the entry's page number is reset to 0 and it is
+** re-inserted at the head of the hash bucket used for page number 0.
+** If there is no such entry this function is a no-op.
+*/
+static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){
+  int iSlot = fsHashKey(pFS->nHash, iPg);
+  Page *pEntry = pFS->apHash[iSlot];
+
+  while( pEntry && pEntry->iPg!=iPg ){
+    pEntry = pEntry->pHashNext;
+  }
+  if( pEntry==0 ) return;
+
+  assert( pEntry->nRef==0 || (pEntry->flags & PAGE_FREE)==0 );
+  fsPageRemoveFromHash(pFS, pEntry);
+  pEntry->iPg = 0;
+
+  iSlot = fsHashKey(pFS->nHash, 0);
+  pEntry->pHashNext = pFS->apHash[iSlot];
+  pFS->apHash[iSlot] = pEntry;
+}
+
+/*
+** If the page passed as an argument is dirty, update the database file
+** (or mapping of the database file) with its current contents and mark
+** the page as clean. If pPg is NULL or the page is not dirty, this is a
+** no-op.
+**
+** There are three cases:
+**
+**   * Compressed database: the page is compressed and appended to its
+**     segment as a record, and is assigned its page number here.
+**
+**   * Uncompressed, no page number assigned yet (iPg==0): a page number is
+**     assigned and the page is queued on the pFS->pWaiting list. The page
+**     is NOT written and remains dirty in this case.
+**
+**   * Uncompressed with a page number: the page is written to the file or
+**     copied into the mapping, then the waiting list is flushed.
+**
+** Return LSM_OK if the operation is a success, or an LSM error code
+** otherwise.
+*/
+int lsmFsPagePersist(Page *pPg){
+ int rc = LSM_OK;
+ if( pPg && (pPg->flags & PAGE_DIRTY) ){
+ FileSystem *pFS = pPg->pFS;
+
+ if( pFS->pCompress ){
+ int iHash; /* Hash key of assigned page number */
+ u8 aSz[3]; /* pPg->nCompress as a 24-bit big-endian */
+ assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 );
+
+ /* Compress the page image. */
+ rc = fsCompressIntoBuffer(pFS, pPg);
+
+ /* Serialize the compressed size into buffer aSz[] */
+ putRecordSize(aSz, pPg->nCompress, 0);
+
+ /* Write the serialized page record into the database file. The record
+ ** is the size field, the compressed data, then the size field again.
+ ** Each fsAppendData() call is a no-op if rc is already an error. */
+ pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
+ fsAppendData(pFS, pPg->pSeg, pFS->aOBuffer, pPg->nCompress, &rc);
+ fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc);
+
+ /* Now that it has a page number, insert the page into the hash table */
+ iHash = fsHashKey(pFS->nHash, pPg->iPg);
+ pPg->pHashNext = pFS->apHash[iHash];
+ pFS->apHash[iHash] = pPg;
+
+ /* Account for both size fields plus the compressed payload. */
+ pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress;
+
+ /* NOTE(review): the page is hashed and marked clean even if rc is an
+ ** error at this point - confirm that callers tolerate this. */
+ pPg->flags &= ~PAGE_DIRTY;
+ pFS->nWrite++;
+ }else{
+
+ if( pPg->iPg==0 ){
+ /* No page number has been assigned yet. This occurs with pages used
+ ** in the b-tree hierarchy. They were not assigned page numbers when
+ ** they were created as doing so would cause this call to
+ ** lsmFsPagePersist() to write an out-of-order page. Instead a page
+ ** number is assigned here so that the page data will be appended
+ ** to the current segment.
+ */
+ Page **pp;
+ int iPrev = 0;
+ int iNext = 0;
+ int iHash;
+
+ assert( pPg->pSeg->iFirst );
+ assert( pPg->flags & PAGE_FREE );
+ assert( (pPg->flags & PAGE_HASPREV)==0 );
+ assert( pPg->nData==pFS->nPagesize-4 );
+
+ rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
+ if( rc!=LSM_OK ) return rc;
+
+ /* Evict any existing hash entry for the new page number, then insert
+ ** this page at the head of that hash bucket. */
+ assert( pPg->flags & PAGE_FREE );
+ iHash = fsHashKey(pFS->nHash, pPg->iPg);
+ fsRemoveHashEntry(pFS, pPg->iPg);
+ pPg->pHashNext = pFS->apHash[iHash];
+ pFS->apHash[iHash] = pPg;
+ assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );
+
+ if( iPrev ){
+ /* First page on its block: shift the content up by 4 bytes and store
+ ** the previous-block pointer in the first 4 bytes. */
+ assert( iNext==0 );
+ memmove(&pPg->aData[4], pPg->aData, pPg->nData);
+ lsmPutU32(pPg->aData, iPrev);
+ pPg->flags |= PAGE_HASPREV;
+ pPg->aData += 4;
+ }else if( iNext ){
+ /* Last page on its block: store the next-block pointer in the 4 bytes
+ ** following the page content. */
+ assert( iPrev==0 );
+ lsmPutU32(&pPg->aData[pPg->nData], iNext);
+ }else{
+ /* Neither first nor last on its block: the page may use the extra 4
+ ** bytes for content. */
+ int nData = pPg->nData;
+ pPg->nData += 4;
+ lsmSortedExpandBtreePage(pPg, nData);
+ }
+
+ /* Take an extra reference and append the page to the tail of the
+ ** waiting list. It is written later by lsmFsFlushWaiting(). */
+ pPg->nRef++;
+ for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext);
+ *pp = pPg;
+ assert( pPg->pWaitingNext==0 );
+
+ }else{
+ i64 iOff; /* Offset to write within database file */
+
+ iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
+ if( fsMmapPage(pFS, pPg->iPg)==0 ){
+ /* NOTE(review): subtracting (flags & PAGE_HASPREV) from aData appears
+ ** to rely on PAGE_HASPREV having the value 4, matching the "+= 4"
+ ** applied when the flag is set - confirm against the flag definition. */
+ u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
+ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
+ }else if( pPg->flags & PAGE_FREE ){
+ /* The page lives in a heap buffer but belongs in the mapped region.
+ ** Copy it into the mapping, free the heap buffer and convert the page
+ ** object into a mapped page. */
+ fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
+ if( rc==LSM_OK ){
+ u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
+ u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
+ memcpy(aTo, aFrom, pFS->nPagesize);
+ lsmFree(pFS->pEnv, aFrom);
+ pFS->nCacheAlloc--;
+ pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
+ pPg->flags &= ~PAGE_FREE;
+ fsPageRemoveFromHash(pFS, pPg);
+ pPg->pMappedNext = pFS->pMapped;
+ pFS->pMapped = pPg;
+ }
+ }
+
+ /* Persist any pages queued while waiting for this one to be written. */
+ lsmFsFlushWaiting(pFS, &rc);
+ pPg->flags &= ~PAGE_DIRTY;
+ pFS->nWrite++;
+ }
+ }
+ }
+
+ return rc;
+}
+
+/*
+** For non-compressed databases, this function is a no-op. For compressed
+** databases, it adds a padding record to the segment passed as the third
+** argument.
+**
+** The size of the padding record is selected so that the last byte
+** written is the last byte of a disk sector. This means that if a
+** snapshot is taken and checkpointed, subsequent worker processes will
+** not write to any sector that contains checkpointed data.
+**
+** Two padding formats are written. A gap of 6 or more bytes is stored as
+** a regular record (3-byte size, zeroed body, 3-byte size). A gap of 1 to
+** 5 bytes is stored as a short sequence whose first and last bytes hold
+** the gap size. The pSnapshot argument is not used.
+*/
+int lsmFsSortedPadding(
+  FileSystem *pFS,
+  Snapshot *pSnapshot,
+  Segment *pSeg
+){
+  int rc = LSM_OK;
+  Pgno iEnd;                      /* Current last page of segment */
+  Pgno iSectorEnd;                /* Last byte of the sector holding iEnd */
+  int nGap;                       /* Bytes of padding required */
+
+  if( pFS->pCompress==0 ) return rc;
+
+  iEnd = pSeg->iLastPg;
+  iSectorEnd = (1 + iEnd/pFS->szSector) * pFS->szSector - 1;
+  assert( fsPageToBlock(pFS, iEnd)==fsPageToBlock(pFS, iSectorEnd) );
+  nGap = (int)(iSectorEnd - iEnd);
+
+  /* If the sector boundary lies beyond the last usable byte of the block,
+  ** shrink the padding by 4 bytes. */
+  if( iSectorEnd>fsLastPageOnPagesBlock(pFS, iEnd) ){
+    nGap -= 4;
+  }
+  assert( nGap>=0 );
+
+  if( nGap>0 && nGap<6 ){
+    u8 aShort[5] = {0,0,0,0,0};
+    aShort[0] = (u8)nGap;
+    aShort[nGap-1] = (u8)nGap;
+    fsAppendData(pFS, pSeg, aShort, nGap, &rc);
+  }else if( nGap>=6 ){
+    u8 aHdr[3];
+    int nBody = nGap - 6;         /* Body size, excluding two size fields */
+    pSeg->nSize += nGap;
+    putRecordSize(aHdr, nBody, 1);
+    fsAppendData(pFS, pSeg, aHdr, sizeof(aHdr), &rc);
+    memset(pFS->aOBuffer, 0, nBody);
+    fsAppendData(pFS, pSeg, pFS->aOBuffer, nBody, &rc);
+    fsAppendData(pFS, pSeg, aHdr, sizeof(aHdr), &rc);
+  }
+
+  assert( rc!=LSM_OK
+       || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg)
+       || ((pSeg->iLastPg + 1) % pFS->szSector)==0
+  );
+
+  return rc;
+}
+
+
+/*
+** Add one to the reference count of the page object passed as the only
+** argument. Passing NULL is a harmless no-op.
+*/
+void lsmFsPageRef(Page *pPg){
+  if( pPg==0 ) return;
+  pPg->nRef += 1;
+}
+
+/*
+** Release a page-reference obtained using fsPageGet(). Passing NULL is a
+** harmless no-op that returns LSM_OK.
+**
+** The reference count is decremented. If it reaches zero the page is
+** persisted with lsmFsPagePersist() and then, depending on whether the
+** PAGE_FREE flag is set, either added to the LRU list or unlinked from
+** the list of mapped pages and pushed onto the free list of page objects.
+**
+** The value returned is the result of lsmFsPagePersist(), or LSM_OK if
+** the page was not persisted.
+*/
+int lsmFsPageRelease(Page *pPg){
+  int rc;
+  FileSystem *pFS;
+
+  if( pPg==0 ) return LSM_OK;
+
+  assert( pPg->nRef>0 );
+  pPg->nRef -= 1;
+  if( pPg->nRef!=0 ) return LSM_OK;
+
+  /* This was the final reference. */
+  pFS = pPg->pFS;
+  rc = lsmFsPagePersist(pPg);
+  pFS->nOut--;
+
+  assert( pPg->pFS->pCompress
+       || fsIsFirst(pPg->pFS, pPg->iPg)==0
+       || (pPg->flags & PAGE_HASPREV)
+  );
+  /* Undo the offset applied to aData for pages with a "prev" pointer. */
+  pPg->aData -= (pPg->flags & PAGE_HASPREV);
+  pPg->flags &= ~PAGE_HASPREV;
+
+  if( pPg->flags & PAGE_FREE ){
+    fsPageAddToLru(pFS, pPg);
+  }else{
+    /* Unlink from the mapped list */
+    Page **ppLink = &pFS->pMapped;
+    while( (*ppLink)!=pPg ){
+      ppLink = &(*ppLink)->pMappedNext;
+    }
+    *ppLink = pPg->pMappedNext;
+    pPg->pMappedNext = 0;
+
+    /* Push onto the free list */
+    pPg->pFreeNext = pFS->pFree;
+    pFS->pFree = pPg;
+  }
+
+  return rc;
+}
+
+/*
+** Accessor for the FileSystem.nRead counter (the total number of pages
+** read from the database file).
+*/
+int lsmFsNRead(FileSystem *pFS){
+  return pFS->nRead;
+}
+
+/*
+** Accessor for the FileSystem.nWrite counter (the total number of pages
+** written to the database file).
+*/
+int lsmFsNWrite(FileSystem *pFS){
+  return pFS->nWrite;
+}
+
+/*
+** Return the environment pointer (FileSystem.pEnv) used by the file-system
+** object passed as the only argument.
+*/
+lsm_env *lsmFsEnv(FileSystem *pFS){
+  lsm_env *pEnv = pFS->pEnv;
+  return pEnv;
+}
+
+/*
+** Return the environment pointer used by the file-system object that
+** owns the page passed as the only argument.
+*/
+lsm_env *lsmPageEnv(Page *pPg) {
+  FileSystem *pOwner = pPg->pFS;
+  return pOwner->pEnv;
+}
+
+/*
+** Return the file-system object (Page.pFS) that the page passed as the
+** only argument belongs to.
+*/
+FileSystem *lsmPageFS(Page *pPg){
+  FileSystem *pOwner = pPg->pFS;
+  return pOwner;
+}
+
+/*
+** Return the sector size stored in the file-system object
+** (FileSystem.szSector).
+*/
+int lsmFsSectorSize(FileSystem *pFS){
+  int szSector = pFS->szSector;
+  return szSector;
+}
+
+/*
+** Helper for findSegment(). Return pRun if its first page is iFirst, or
+** NULL otherwise.
+*/
+static Segment *startsWith(Segment *pRun, Pgno iFirst){
+  if( pRun->iFirst!=iFirst ) return 0;
+  return pRun;
+}
+
+/*
+** Return the segment that starts with page iFirst, if any. If no such segment
+** can be found, return NULL.
+**
+** Each level of snapshot pWorker is searched in order. Within a level the
+** left-hand segment (lhs) is checked first, followed by each of the nRight
+** entries in the aRhs[] array.
+*/
+static Segment *findSegment(Snapshot *pWorker, Pgno iFirst){
+  Level *pLvl;                    /* Used to iterate through db levels */
+  Segment *pSeg = 0;              /* Pointer to segment to return */
+
+  for(pLvl=lsmDbSnapshotLevel(pWorker); pLvl && pSeg==0; pLvl=pLvl->pNext){
+    if( 0==(pSeg = startsWith(&pLvl->lhs, iFirst)) ){
+      int i;
+      for(i=0; i<pLvl->nRight; i++){
+        if( (pSeg = startsWith(&pLvl->aRhs[i], iFirst)) ) break;
+      }
+    }
+  }
+
+  return pSeg;
+}
+
+/*
+** This function implements the lsm_info(LSM_INFO_ARRAY_STRUCTURE) request.
+** If successful, *pzOut is set to point to a nul-terminated string
+** containing the array structure and LSM_OK is returned. The caller should
+** eventually free the string using lsmFree().
+**
+** If bBlock is true the string is the space-separated list of block
+** numbers used by the array. Otherwise it lists the first page of the
+** array, then the last and first page numbers either side of each block
+** boundary, and finally the last page of the array.
+**
+** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
+** LSM_ERROR is returned if iFirst is zero or if no array starts on page
+** iFirst.
+*/
+int lsmInfoArrayStructure(
+  lsm_db *pDb,
+  int bBlock,                     /* True for block numbers only */
+  Pgno iFirst,
+  char **pzOut
+){
+  int rc = LSM_OK;
+  Snapshot *pWorker;              /* Worker snapshot */
+  Segment *pArray = 0;            /* Array to report on */
+  int bUnlock = 0;
+
+  *pzOut = 0;
+  if( iFirst==0 ) return LSM_ERROR;
+
+  /* Obtain the worker snapshot */
+  pWorker = pDb->pWorker;
+  if( !pWorker ){
+    rc = lsmBeginWork(pDb);
+    if( rc!=LSM_OK ) return rc;
+    pWorker = pDb->pWorker;
+    bUnlock = 1;
+  }
+
+  /* Search for the array that starts on page iFirst */
+  pArray = findSegment(pWorker, iFirst);
+
+  if( pArray==0 ){
+    /* Could not find the requested array. This is an error. */
+    rc = LSM_ERROR;
+  }else{
+    FileSystem *pFS = pDb->pFS;
+    LsmString str;
+    int iBlk;
+    int iLastBlk;
+
+    iBlk = fsPageToBlock(pFS, pArray->iFirst);
+    iLastBlk = fsPageToBlock(pFS, pArray->iLastPg);
+
+    /* Page numbers are formatted as 64-bit values ("%lld" with an explicit
+    ** i64 cast) so that the format always matches the argument type. The
+    ** loops stop as soon as fsBlockNext() fails; otherwise iBlk might never
+    ** reach iLastBlk. */
+    lsmStringInit(&str, pDb->pEnv);
+    if( bBlock ){
+      lsmStringAppendf(&str, "%d", iBlk);
+      while( rc==LSM_OK && iBlk!=iLastBlk ){
+        rc = fsBlockNext(pFS, pArray, iBlk, &iBlk);
+        if( rc==LSM_OK ){
+          lsmStringAppendf(&str, " %d", iBlk);
+        }
+      }
+    }else{
+      lsmStringAppendf(&str, "%lld", (i64)pArray->iFirst);
+      while( rc==LSM_OK && iBlk!=iLastBlk ){
+        lsmStringAppendf(&str, " %lld", (i64)fsLastPageOnBlock(pFS, iBlk));
+        rc = fsBlockNext(pFS, pArray, iBlk, &iBlk);
+        if( rc==LSM_OK ){
+          lsmStringAppendf(&str, " %lld", (i64)fsFirstPageOnBlock(pFS, iBlk));
+        }
+      }
+      if( rc==LSM_OK ){
+        lsmStringAppendf(&str, " %lld", (i64)pArray->iLastPg);
+      }
+    }
+
+    /* Honor the documented contract: no string is returned on error. */
+    if( rc!=LSM_OK ){
+      lsmFree(pDb->pEnv, str.z);
+    }else{
+      *pzOut = str.z;
+    }
+  }
+
+  if( bUnlock ){
+    /* This function started the work session above, so it must end it.
+    ** The outcome of lsmFinishWork() does not affect the return value. */
+    int rcwork = LSM_BUSY;
+    lsmFinishWork(pDb, 0, &rcwork);
+  }
+  return rc;
+}
+
+/*
+** Determine whether the block containing page iPg is one of the blocks
+** that make up segment pSeg. The blocks of the segment are visited in
+** order, starting at the block containing pSeg->iFirst, until either the
+** block containing iPg or the last block of the segment is reached.
+**
+** *pbRes is set to true if the block was found, or false otherwise. The
+** check is made at block granularity only.
+**
+** Return LSM_OK if successful, or an lsm error code if fsBlockNext()
+** fails while walking the segment.
+*/
+int lsmFsSegmentContainsPg(
+  FileSystem *pFS,
+  Segment *pSeg,
+  Pgno iPg,
+  int *pbRes
+){
+  Redirect *pRedir = pSeg->pRedirect;
+  int rc = LSM_OK;
+  int iBlk;
+  int iLastBlk;
+  int iPgBlock;                   /* Block containing page iPg */
+
+  /* Derive the target block from iPg itself. Computing it from
+  ** pSeg->iFirst would make the result independent of the page asked
+  ** about.
+  ** NOTE(review): the redirect mapping is applied here on the assumption
+  ** that iPg is expressed in the same (pre-redirect) numbering as
+  ** pSeg->iFirst and pSeg->iLastPg - confirm against the callers. */
+  iPgBlock = fsRedirectBlock(pRedir, fsPageToBlock(pFS, iPg));
+  iBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iFirst));
+  iLastBlk = fsRedirectBlock(pRedir, fsPageToBlock(pFS, pSeg->iLastPg));
+
+  while( iBlk!=iLastBlk && iBlk!=iPgBlock && rc==LSM_OK ){
+    rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
+  }
+
+  *pbRes = (iBlk==iPgBlock);
+  return rc;
+}
+
+/*
+** This function implements the lsm_info(LSM_INFO_ARRAY_PAGES) request.
+** If successful, *pzOut is set to point to a nul-terminated string
+** containing the page number of every page in the array that starts on
+** page iFirst, each preceded by a single space, and LSM_OK is returned.
+** The caller should eventually free the string using lsmFree().
+**
+** If an error occurs, *pzOut is set to NULL and an LSM error code returned.
+** LSM_ERROR is returned if iFirst is zero or if no array starts on page
+** iFirst.
+*/
+int lsmInfoArrayPages(lsm_db *pDb, Pgno iFirst, char **pzOut){
+  int rc = LSM_OK;
+  Snapshot *pSnap;                /* Worker snapshot */
+  Segment *pFound;                /* Array to report on */
+  int bStartedWork = 0;           /* True if lsmBeginWork() was called here */
+
+  *pzOut = 0;
+  if( iFirst==0 ) return LSM_ERROR;
+
+  /* Use the existing worker snapshot, or begin work to obtain one. */
+  pSnap = pDb->pWorker;
+  if( pSnap==0 ){
+    rc = lsmBeginWork(pDb);
+    if( rc!=LSM_OK ) return rc;
+    pSnap = pDb->pWorker;
+    bStartedWork = 1;
+  }
+
+  /* Locate the array that starts on page iFirst. */
+  pFound = findSegment(pSnap, iFirst);
+
+  if( pFound ){
+    LsmString out;
+    Page *pCur = 0;
+    FileSystem *pFS = pDb->pFS;
+
+    lsmStringInit(&out, pDb->pEnv);
+    rc = lsmFsDbPageGet(pFS, pFound, iFirst, &pCur);
+    for( ; rc==LSM_OK && pCur; ){
+      Page *pFollow = 0;
+      lsmStringAppendf(&out, " %lld", lsmFsPageNumber(pCur));
+      rc = lsmFsDbPageNext(pFound, pCur, 1, &pFollow);
+      lsmFsPageRelease(pCur);
+      pCur = pFollow;
+    }
+
+    if( rc==LSM_OK ){
+      *pzOut = out.z;
+    }else{
+      lsmFree(pDb->pEnv, out.z);
+    }
+  }else{
+    /* Could not find the requested array. This is an error. */
+    rc = LSM_ERROR;
+  }
+
+  if( bStartedWork ){
+    int rcwork = LSM_BUSY;
+    lsmFinishWork(pDb, 0, &rcwork);
+  }
+  return rc;
+}
+
+/*
+** The following macros are used by the integrity-check code. Associated with
+** each block in the database is an 8-bit bit mask (the entry in the aUsed[]
+** array). As the integrity-check meanders through the database, it sets the
+** following bits to indicate how each block is used.
+**
+** INTEGRITY_CHECK_FIRST_PG:
+** First page of block is in use by sorted run.
+**
+** INTEGRITY_CHECK_LAST_PG:
+** Last page of block is in use by sorted run.
+**
+** INTEGRITY_CHECK_USED:
+** At least one page of the block is in use by a sorted run.
+**
+** INTEGRITY_CHECK_FREE:
+** The free block list contains an entry corresponding to this block.
+*/
+#define INTEGRITY_CHECK_FIRST_PG 0x01
+#define INTEGRITY_CHECK_LAST_PG 0x02
+#define INTEGRITY_CHECK_USED 0x04
+#define INTEGRITY_CHECK_FREE 0x08
+
+/*
+** Helper function for lsmFsIntegrityCheck(). Walk every block used by
+** segment pSeg and set the matching INTEGRITY_CHECK_* bits in aUsed[],
+** where aUsed[iBlk-1] is the entry for block iBlk. assert() fails if a
+** first-page or last-page bit is already set for a block visited here.
+**
+** This is a no-op if pSeg is NULL or the segment is empty. Parameter
+** nUsed is not referenced.
+*/
+static void checkBlocks(
+  FileSystem *pFS,
+  Segment *pSeg,
+  int bExtra,                     /* If true, count the "next" block if any */
+  int nUsed,
+  u8 *aUsed
+){
+  int rc;
+  int iBlk;                       /* Current block (during iteration) */
+  int iFirstBlk;                  /* First block of segment */
+  int iLastBlk;                   /* Last block of segment */
+  int bEndsOnBoundary;            /* True if iLastPg is last on its block */
+
+  if( pSeg==0 ) return;
+  if( !(pSeg->nSize>0) ) return;
+
+  assert( 0==fsSegmentRedirects(pFS, pSeg) );
+  iFirstBlk = fsPageToBlock(pFS, pSeg->iFirst);
+  iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg);
+  iBlk = iFirstBlk;
+
+  bEndsOnBoundary = (fsLastPageOnBlock(pFS, iLastBlk)==pSeg->iLastPg);
+  assert( iBlk>0 );
+
+  for(;;){
+    u8 *pFlags = &aUsed[iBlk-1];
+
+    /* iBlk is a part of this sorted run. */
+    *pFlags |= INTEGRITY_CHECK_USED;
+
+    /* The first page of the block is in use unless this is the first block
+    ** of the segment and the segment starts part-way through it. */
+    if( fsFirstPageOnBlock(pFS, iBlk)==pSeg->iFirst || iBlk!=iFirstBlk ){
+      assert( (*pFlags & INTEGRITY_CHECK_FIRST_PG)==0 );
+      *pFlags |= INTEGRITY_CHECK_FIRST_PG;
+    }
+
+    /* The last page of the block is in use unless the sorted run finishes
+    ** before the last page on this block. */
+    if( iBlk!=iLastBlk || bEndsOnBoundary ){
+      assert( (*pFlags & INTEGRITY_CHECK_LAST_PG)==0 );
+      *pFlags |= INTEGRITY_CHECK_LAST_PG;
+    }
+
+    if( iBlk==iLastBlk ){
+      /* Special case. The sorted run being scanned is the output run of
+      ** a level currently undergoing an incremental merge. The sorted
+      ** run ends on the last page of iBlk, but the next block has already
+      ** been allocated. So mark it as in use as well. */
+      if( bEndsOnBoundary && bExtra ){
+        int iExtra = 0;
+        rc = fsBlockNext(pFS, pSeg, iBlk, &iExtra);
+        assert( rc==LSM_OK );
+
+        assert( aUsed[iExtra-1]==0 );
+        aUsed[iExtra-1] |= INTEGRITY_CHECK_USED;
+        aUsed[iExtra-1] |= INTEGRITY_CHECK_FIRST_PG;
+        aUsed[iExtra-1] |= INTEGRITY_CHECK_LAST_PG;
+      }
+      /* That was the last block in the run. */
+      break;
+    }
+
+    /* Move on to the next block in the sorted run. */
+    rc = fsBlockNext(pFS, pSeg, iBlk, &iBlk);
+    assert( rc==LSM_OK );
+    if( iBlk==0 ) break;
+  }
+}
+
+/*
+** Context object passed to checkFreelistCb() by lsmFsIntegrityCheck().
+*/
+typedef struct CheckFreelistCtx {
+  u8 *aUsed;                      /* Per-block flags; entry iBlk-1 is block iBlk */
+  int nBlock;                     /* Number of entries in aUsed[] */
+} CheckFreelistCtx;
+/*
+** Free-list walker callback used by lsmFsIntegrityCheck(). pCtx points to
+** a CheckFreelistCtx. Asserts that block iBlk is in range and has not been
+** marked in any way yet, then marks it as INTEGRITY_CHECK_FREE. Parameter
+** iSnapshot is not used. Always returns 0.
+*/
+static int checkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
+  CheckFreelistCtx *pCheck = (CheckFreelistCtx *)pCtx;
+  u8 *pEntry;
+
+  assert( iBlk>=1 );
+  assert( iBlk<=pCheck->nBlock );
+  pEntry = &pCheck->aUsed[iBlk-1];
+  assert( *pEntry==0 );
+  *pEntry = INTEGRITY_CHECK_FREE;
+  return 0;
+}
+
+/*
+** This function checks that all blocks in the database file are accounted
+** for. For each block, exactly one of the following must be true:
+**
+**   + the block is part of a sorted run, or
+**   + the block is on the free-block list
+**
+** This function also checks that there are no references to blocks with
+** out-of-range block numbers.
+**
+** The check works by building an array with one flags byte per block,
+** populated by checkBlocks() for every segment of every level of the
+** worker snapshot and by checkFreelistCb() for every free-list entry.
+**
+** If no errors are found, non-zero is returned. If an error is found, an
+** assert() fails. Non-zero is also returned if the flags array cannot be
+** allocated, in which case no checking is done.
+*/
+int lsmFsIntegrityCheck(lsm_db *pDb){
+  CheckFreelistCtx ctx;
+  FileSystem *pFS = pDb->pFS;
+  int i;
+  int rc;
+  Freelist freelist = {0, 0, 0};
+  u8 *aUsed;
+  Level *pLevel;
+  Snapshot *pWorker = pDb->pWorker;
+  int nBlock = pWorker->nBlock;
+
+#if 0
+  static int nCall = 0;
+  nCall++;
+  printf("%d calls\n", nCall);
+#endif
+
+  aUsed = lsmMallocZero(pDb->pEnv, nBlock);
+  if( aUsed==0 ){
+    /* Malloc has failed. Since this function is only called within debug
+    ** builds, this probably means the user is running an OOM injection test.
+    ** Regardless, it will not be possible to run the integrity-check at this
+    ** time, so assume the database is Ok and return non-zero. */
+    return 1;
+  }
+
+  /* Mark the blocks used by each segment. Only the left-hand segment of a
+  ** level that also has right-hand segments may own a pre-allocated "next"
+  ** block. */
+  for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){
+    int j;
+    checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed);
+    for(j=0; j<pLevel->nRight; j++){
+      checkBlocks(pFS, &pLevel->aRhs[j], 0, nBlock, aUsed);
+    }
+  }
+
+  /* Mark all blocks in the free-list as used */
+  ctx.aUsed = aUsed;
+  ctx.nBlock = nBlock;
+  rc = lsmWalkFreelist(pDb, 0, checkFreelistCb, (void *)&ctx);
+
+  /* Every block must now be accounted for. */
+  if( rc==LSM_OK ){
+    for(i=0; i<nBlock; i++) assert( aUsed[i]!=0 );
+  }
+
+  lsmFree(pDb->pEnv, aUsed);
+  lsmFree(pDb->pEnv, freelist.aEntry);
+
+  return 1;
+}
+
+#ifndef NDEBUG
+/*
+** Return true if pPg happens to be the last page in segment pSeg. Or false
+** otherwise. This function is only invoked as part of assert() conditions.
+**
+** In compressed mode the answer comes from fsNextPageOffset(): the page is
+** treated as the last if that call fails or reports no following page.
+** Otherwise the page number is compared against pSeg->iLastPg.
+*/
+int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){
+  FileSystem *pFS = pPg->pFS;
+
+  if( pFS->pCompress==0 ){
+    return (pPg->iPg==pSeg->iLastPg);
+  }else{
+    Pgno iFollow = 0;
+    int rc;
+    /* The record on disk is the compressed data plus 6 bytes. */
+    rc = fsNextPageOffset(pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iFollow);
+    if( rc!=LSM_OK ) return 1;
+    return (iFollow==0);
+  }
+}
+#endif
diff --git a/ext/lsm1/lsm_log.c b/ext/lsm1/lsm_log.c
new file mode 100644
index 0000000..a66e40b
--- /dev/null
+++ b/ext/lsm1/lsm_log.c
@@ -0,0 +1,1156 @@
+/*
+** 2011-08-13
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** This file contains the implementation of LSM database logging. Logging
+** has one purpose in LSM - to make transactions durable.
+**
+** When data is written to an LSM database, it is initially stored in an
+** in-memory tree structure. Since this structure is in volatile memory,
+** if a power failure or application crash occurs it may be lost. To
+** prevent loss of data in this case, each time a record is written to the
+** in-memory tree an equivalent record is appended to the log on disk.
+** If a power failure or application crash does occur, data can be recovered
+** by reading the log.
+**
+** A log file consists of the following types of records representing data
+** written into the database:
+**
+** LOG_WRITE: A key-value pair written to the database.
+** LOG_DELETE: A delete key issued to the database.
+** LOG_COMMIT: A transaction commit.
+**
+** And the following types of records for ancillary purposes..
+**
+** LOG_EOF: A record indicating the end of a log file.
+** LOG_PAD1: A single byte padding record.
+** LOG_PAD2: An N byte padding record (N>1).
+** LOG_JUMP: A pointer to another offset within the log file.
+**
+** Each transaction written to the log contains one or more LOG_WRITE and/or
+** LOG_DELETE records, followed by a LOG_COMMIT record. The LOG_COMMIT record
+** contains an 8-byte checksum based on all previous data written to the
+** log file.
+**
+** LOG CHECKSUMS & RECOVERY
+**
+** Checksums are found in two types of log records: LOG_COMMIT and
+** LOG_CKSUM records. In order to recover content from a log, a client
+** reads each record from the start of the log, calculating a checksum as
+** it does. Each time a LOG_COMMIT or LOG_CKSUM is encountered, the
+** recovery process verifies that the checksum stored in the log
+** matches the calculated checksum. If it does not, the recovery process
+** can stop reading the log.
+**
+** If a recovery process reads records (other than COMMIT or CKSUM)
+** consisting of at least LSM_CKSUM_MAXDATA bytes, then the next record in
+** the log must be either a LOG_CKSUM or LOG_COMMIT record. If it is
+** not, the recovery process also stops reading the log.
+**
+** To recover the log file, it must be read twice. The first time to
+** determine the location of the last valid commit record. And the second
+** time to load data into the in-memory tree.
+**
+** Todo: Surely there is a better way...
+**
+** LOG WRAPPING
+**
+** If the log file were never deleted or wrapped, it would be possible to
+** read it from start to end each time recovery is required (i.e. each time
+** the number of database clients changes from 0 to 1), effectively reading
+** the entire history of the database each time. This would quickly become
+** inefficient. Additionally, since the log file would grow without bound,
+** it would waste storage space.
+**
+** Instead, part of each checkpoint written into the database file contains
+** a log offset (and other information required to read the log starting
+** at this offset) at which to begin recovery. Call this offset $O.
+**
+** Once a checkpoint has been written and synced into the database file, it
+** is guaranteed that no recovery process will need to read any data before
+** offset $O of the log file. It is therefore safe to begin overwriting
+** any data that occurs before offset $O.
+**
+** This implementation separates the log into three regions mapped into
+** the log file - regions 0, 1 and 2. During recovery, regions are read
+** in ascending order (i.e. 0, then 1, then 2). Each region is zero or
+** more bytes in size.
+**
+** |---1---|..|--0--|.|--2--|....
+**
+** New records are always appended to the end of region 2.
+**
+** Initially (when it is empty), all three regions are zero bytes in size.
+** Each of them is located at the beginning of the file. As records are
+** added to the log, region 2 grows, so that the log consists of a zero
+** byte region 1, followed by a zero byte region 0, followed by an N byte
+** region 2. After one or more checkpoints have been written to disk,
+** the start point of region 2 is moved to $O. For example:
+**
+** A) ||.........|--2--|....
+**
+** (both regions 0 and 1 are 0 bytes in size at offset 0).
+**
+** Eventually, the log wraps around to write new records into the start.
+** At this point, region 2 is renamed to region 0. Region 0 is renamed
+** to region 2. After appending a few records to the new region 2, the
+** log file looks like this:
+**
+** B) ||--2--|...|--0--|....
+**
+** (region 1 is still 0 bytes in size, located at offset 0).
+**
+** Any checkpoints made at this point may reduce the size of region 0.
+** However, if they do not, and region 2 expands so that it is about to
+** overwrite the start of region 0, then region 2 is renamed to region 1,
+** and a new region 2 created at the end of the file following the existing
+** region 0.
+**
+** C) |---1---|..|--0--|.|-2-|
+**
+** In this state records are appended to region 2 until checkpoints have
+** contracted regions 0 AND 1 until they are both zero bytes in size. They
+** are then shifted to the start of the log file, leaving the system in
+** the equivalent of state A above.
+**
+** Alternatively, state B may transition directly to state A if the size
+** of region 0 is reduced to zero bytes before region 2 threatens to
+** encroach upon it.
+**
+** LOG_PAD1 & LOG_PAD2 RECORDS
+**
+** PAD1 and PAD2 records may appear in a log file at any point. They allow
+** a process writing the log file to align the beginning of transactions
+** with the beginning of disk sectors, which increases robustness.
+**
+** RECORD FORMATS:
+**
+** LOG_EOF: * A single 0x00 byte.
+**
+** LOG_PAD1: * A single 0x01 byte.
+**
+** LOG_PAD2: * A single 0x02 byte, followed by
+** * The number of unused bytes (N) as a varint,
+** * An N byte block of unused space.
+**
+** LOG_COMMIT: * A single 0x03 byte.
+** * An 8-byte checksum.
+**
+** LOG_JUMP: * A single 0x04 byte.
+** * Absolute file offset to jump to, encoded as a varint.
+**
+** LOG_WRITE: * A single 0x06 or 0x07 byte,
+** * The number of bytes in the key, encoded as a varint,
+** * The number of bytes in the value, encoded as a varint,
+** * If the first byte was 0x07, an 8 byte checksum.
+** * The key data,
+** * The value data.
+**
+** LOG_DELETE: * A single 0x08 or 0x09 byte,
+** * The number of bytes in the key, encoded as a varint,
+** * If the first byte was 0x09, an 8 byte checksum.
+** * The key data.
+**
+** Varints are as described in lsm_varint.c (SQLite 4 format).
+**
+** CHECKSUMS:
+**
+** The checksum is calculated using two 32-bit unsigned integers, s0 and
+** s1. The initial value for both is 42. It is updated each time a record
+** is written into the log file by treating the encoded (binary) record as
+** an array of 32-bit little-endian integers. Then, if x[] is the integer
+** array, updating the checksum accumulators as follows:
+**
+** for i from 0 to n-1 step 2:
+** s0 += x[i] + s1;
+** s1 += x[i+1] + s0;
+** endfor
+**
+** If the record is not an even multiple of 8-bytes in size it is padded
+** with zeroes to make it so before the checksum is updated.
+**
+** The checksum stored in a COMMIT, WRITE or DELETE is based on all bytes
+** up to the start of the 8-byte checksum itself, including the COMMIT,
+** WRITE or DELETE fields that appear before the checksum in the record.
+**
+** VARINT FORMAT
+**
+** See lsm_varint.c.
+*/
+
+#ifndef _LSM_INT_H
+# include "lsmInt.h"
+#endif
+
+/* Log record types. Each value is the first byte of the corresponding
+** record in the log file. For WRITE, DELETE and DRANGE the low bit selects
+** the variant that embeds an 8-byte checksum (X_CKSUM == X|0x01). */
+#define LSM_LOG_EOF 0x00
+#define LSM_LOG_PAD1 0x01
+#define LSM_LOG_PAD2 0x02
+#define LSM_LOG_COMMIT 0x03
+#define LSM_LOG_JUMP 0x04
+
+#define LSM_LOG_WRITE 0x06
+#define LSM_LOG_WRITE_CKSUM 0x07
+
+#define LSM_LOG_DELETE 0x08
+#define LSM_LOG_DELETE_CKSUM 0x09
+
+#define LSM_LOG_DRANGE 0x0A
+#define LSM_LOG_DRANGE_CKSUM 0x0B
+
+/* Require a checksum every 32KB. lsmLogWrite() embeds a checksum in a
+** record once the buffered data plus that record exceeds this size. */
+#define LSM_CKSUM_MAXDATA (32*1024)
+
+/* Do not wrap a log file smaller than this in bytes. lsmLogBegin() only
+** wraps to the start of the file if region 2 begins at or after this
+** offset. */
+#define LSM_MIN_LOGWRAP (128*1024)
+
+/*
+** State used to append records to the log file during one write
+** transaction. Set up by lsmLogBegin().
+**
+** szSector:
+** Commit records must be aligned to end on szSector boundaries. If
+** the safety-mode is set to NORMAL or OFF, this value is 1. Otherwise,
+** if the safety-mode is set to FULL, it is the size of the file-system
+** sectors as reported by lsmFsSectorSize().
+**
+** buf:
+** Must remain the final member of the structure. When lsmLogBegin()
+** recycles an existing LogWriter it zeroes every byte that precedes
+** buf and then resets buf.n, leaving the buffer allocation in place.
+*/
+struct LogWriter {
+ u32 cksum0; /* Checksum 0 at offset iOff */
+ u32 cksum1; /* Checksum 1 at offset iOff */
+ int iCksumBuf; /* Bytes of buf that have been checksummed */
+ i64 iOff; /* Offset at start of buffer buf */
+ int szSector; /* Sector size for this transaction */
+ LogRegion jump; /* Avoid writing to this region */
+ i64 iRegion1End; /* End of first region written by trans */
+ i64 iRegion2Start; /* Start of second region written by trans */
+ LsmString buf; /* Buffer containing data not yet written */
+};
+
+/*
+** Decode the four bytes aIn[0..3] as an unsigned 32-bit integer stored
+** least-significant byte first, and return the decoded value.
+*/
+static u32 getU32le(u8 *aIn){
+  u32 v = (u32)aIn[0];
+  v |= ((u32)aIn[1] << 8);
+  v |= ((u32)aIn[2] << 16);
+  v |= ((u32)aIn[3] << 24);
+  return v;
+}
+
+
+/*
+** Fold the n bytes at z into the running checksum (*pCksum0, *pCksum1).
+**
+** The input is consumed as little-endian 32-bit integers, two integers
+** (8 bytes) per step. If n is not a multiple of 8, the trailing bytes are
+** copied into a zeroed 8-byte scratch buffer so that the final step sees
+** the input padded with zero bytes. The input is read one byte at a time,
+** so pointer z need not be aligned to an 8-byte boundary.
+**
+** n must be greater than zero.
+*/
+static void logCksumUnaligned(
+  char *z,                        /* Input buffer */
+  int n,                          /* Size of input buffer in bytes */
+  u32 *pCksum0,                   /* IN/OUT: Checksum value 0 */
+  u32 *pCksum1                    /* IN/OUT: Checksum value 1 */
+){
+  u8 *a = (u8 *)z;
+  u32 cksum0 = *pCksum0;
+  u32 cksum1 = *pCksum1;
+  int nIn = (n/8) * 8;            /* Bytes in whole 8-byte groups */
+  int i;
+
+  assert( n>0 );
+  for(i=0; i<nIn; i+=8){
+    cksum0 += getU32le(&a[i]) + cksum1;
+    cksum1 += getU32le(&a[i+4]) + cksum0;
+  }
+
+  /* Handle the final, partial group (1 to 7 bytes), zero-padded to 8. */
+  if( nIn!=n ){
+    u8 aBuf[8] = {0, 0, 0, 0, 0, 0, 0, 0};
+    assert( (n-nIn)<8 && n>nIn );
+    memcpy(aBuf, &a[nIn], n-nIn);
+    cksum0 += getU32le(aBuf) + cksum1;
+    cksum1 += getU32le(&aBuf[4]) + cksum0;
+  }
+
+  *pCksum0 = cksum0;
+  *pCksum1 = cksum1;
+}
+
+/*
+** Advance the running checksum (pLog->cksum0, pLog->cksum1) so that it
+** covers the first nBuf bytes of the write buffer pLog->buf. Bytes before
+** pLog->iCksumBuf are already covered and are not processed again.
+*/
+static void logUpdateCksum(LogWriter *pLog, int nBuf){
+  int iFrom = pLog->iCksumBuf;    /* First byte not yet checksummed */
+  int nNew = nBuf - iFrom;        /* Number of bytes to add */
+
+  assert( (iFrom % 8)==0 );
+  assert( iFrom<=nBuf );
+  assert( (nBuf % 8)==0 || nBuf==pLog->buf.n );
+
+  if( nNew>0 ){
+    logCksumUnaligned(&pLog->buf.z[iFrom], nNew, &pLog->cksum0, &pLog->cksum1);
+  }
+  pLog->iCksumBuf = nBuf;
+}
+
+/* Return the offset of the first byte of the sector that contains iOff. */
+static i64 firstByteOnSector(LogWriter *pLog, i64 iOff){
+  i64 szSector = (i64)pLog->szSector;
+  return iOff - (iOff % szSector);
+}
+/* Return the offset of the last byte of the sector that contains iOff. */
+static i64 lastByteOnSector(LogWriter *pLog, i64 iOff){
+  i64 iFirst = firstByteOnSector(pLog, iOff);
+  return iFirst + pLog->szSector - 1;
+}
+
+/*
+** If possible, reclaim log file space. Log file space is reclaimed after
+** a snapshot that points to the same data in the database file is synced
+** into the db header.
+**
+** Reclaiming space means updating pDb->treehdr.log: every region that
+** lies wholly before the synced log offset is set to zero size, and the
+** region containing that offset is trimmed to start at it.
+**
+** Returns LSM_OK if successful (including the case where nothing can be
+** reclaimed), or an LSM error code otherwise.
+*/
+static int logReclaimSpace(lsm_db *pDb){
+  int rc;
+  int iMeta;
+  int bRotrans;                   /* True if there exists some ro-trans */
+
+  /* Test if there exists some other connection with a read-only transaction
+  ** open. If there does, then log file space may not be reclaimed. */
+  rc = lsmDetectRoTrans(pDb, &bRotrans);
+  if( rc!=LSM_OK || bRotrans ) return rc;
+
+  iMeta = (int)pDb->pShmhdr->iMetaPage;
+  if( iMeta==1 || iMeta==2 ){
+    DbLog *pLog = &pDb->treehdr.log;
+    i64 iSyncedId;
+
+    /* Read the snapshot-id of the snapshot stored on meta-page iMeta. Note
+    ** that in theory, the value read is untrustworthy (due to a race
+    ** condition - see comments above lsmFsReadSyncedId()). So it is only
+    ** ever used to conclude that no log space can be reclaimed. If it seems
+    ** to indicate that it may be possible to reclaim log space, a
+    ** second call to lsmCheckpointSynced() (which does return trustworthy
+    ** values) is made below to confirm. */
+    rc = lsmFsReadSyncedId(pDb, iMeta, &iSyncedId);
+
+    if( rc==LSM_OK && pLog->iSnapshotId!=iSyncedId ){
+      i64 iSnapshotId = 0;
+      i64 iOff = 0;
+      rc = lsmCheckpointSynced(pDb, &iSnapshotId, &iOff, 0);
+      if( rc==LSM_OK && pLog->iSnapshotId<iSnapshotId ){
+        int iRegion;
+
+        /* Regions are scanned in recovery order (0, 1, 2). Each region
+        ** that does not contain iOff is emptied; the scan stops at the
+        ** region that does contain it. */
+        for(iRegion=0; iRegion<3; iRegion++){
+          LogRegion *p = &pLog->aRegion[iRegion];
+          if( iOff>=p->iStart && iOff<=p->iEnd ) break;
+          p->iStart = 0;
+          p->iEnd = 0;
+        }
+        assert( iRegion<3 );
+        pLog->aRegion[iRegion].iStart = iOff;
+        pLog->iSnapshotId = iSnapshotId;
+      }
+    }
+  }
+  return rc;
+}
+
+/*
+** This function is called when a write-transaction is first opened. It
+** is assumed that the caller is holding the client-mutex when it is
+** called.
+**
+** Before returning, this function allocates the LogWriter object that
+** will be used to write to the log file during the write transaction, or
+** resets the one already attached to pDb by an earlier transaction.
+** LSM_OK is returned if no error occurs, otherwise an LSM error code.
+**
+** If logging is disabled (pDb->bUseLog==0) this function is a no-op.
+*/
+int lsmLogBegin(lsm_db *pDb){
+  int rc = LSM_OK;
+  LogWriter *pNew;
+  LogRegion *aReg;
+
+  if( pDb->bUseLog==0 ) return LSM_OK;
+
+  /* If the log file has not yet been opened, open it now. Also allocate
+  ** the LogWriter structure, if it has not already been allocated. */
+  rc = lsmFsOpenLog(pDb, 0);
+  if( pDb->pLogWriter==0 ){
+    pNew = lsmMallocZeroRc(pDb->pEnv, sizeof(LogWriter), &rc);
+    if( pNew ){
+      lsmStringInit(&pNew->buf, pDb->pEnv);
+      rc = lsmStringExtend(&pNew->buf, 2);
+    }
+    pDb->pLogWriter = pNew;
+  }else{
+    /* Recycle the existing object: zero every field that precedes buf
+    ** (buf is asserted to be the last member) and empty the buffer while
+    ** keeping its allocation. */
+    pNew = pDb->pLogWriter;
+    assert( (u8 *)(&pNew[1])==(u8 *)(&((&pNew->buf)[1])) );
+    memset(pNew, 0, ((u8 *)&pNew->buf) - (u8 *)pNew);
+    pNew->buf.n = 0;
+  }
+
+  if( rc==LSM_OK ){
+    /* The following call detects whether or not a new snapshot has been
+    ** synced into the database file. If so, it updates the contents of
+    ** the pDb->treehdr.log structure to reclaim any space in the log
+    ** file that is no longer required. 
+    **
+    ** TODO: Calling this every transaction is overkill. And since the 
+    ** call has to read and checksum a snapshot from the database file,
+    ** it is expensive. It would be better to figure out a way so that
+    ** this is only called occasionally - say for every 32KB written to 
+    ** the log file.
+    */
+    rc = logReclaimSpace(pDb);
+  }
+  if( rc!=LSM_OK ){
+    lsmLogClose(pDb);
+    return rc;
+  }
+
+  /* Set the effective sector-size for this transaction. Sectors are assumed
+  ** to be one byte in size if the safety-mode is OFF or NORMAL, or as
+  ** reported by lsmFsSectorSize if it is FULL. */
+  if( pDb->eSafety==LSM_SAFETY_FULL ){
+    pNew->szSector = lsmFsSectorSize(pDb->pFS);
+    assert( pNew->szSector>0 );
+  }else{
+    pNew->szSector = 1;
+  }
+
+  /* There are now three scenarios:
+  **
+  **   1) Regions 0 and 1 are both zero bytes in size and region 2 begins
+  **      at a file offset greater than LSM_MIN_LOGWRAP. In this case, wrap
+  **      around to the start and write data into the start of the log file. 
+  **
+  **   2) Region 1 is zero bytes in size and region 2 occurs earlier in the 
+  **      file than region 0. In this case, append data to region 2, but
+  **      remember to jump over region 0 if required.
+  **
+  **   3) Region 2 is the last in the file. Append to it.
+  */
+  aReg = &pDb->treehdr.log.aRegion[0];
+
+  assert( aReg[0].iEnd==0 || aReg[0].iEnd>aReg[0].iStart );
+  assert( aReg[1].iEnd==0 || aReg[1].iEnd>aReg[1].iStart );
+
+  pNew->cksum0 = pDb->treehdr.log.cksum0;
+  pNew->cksum1 = pDb->treehdr.log.cksum1;
+
+  if( aReg[0].iEnd==0 && aReg[1].iEnd==0 && aReg[2].iStart>=LSM_MIN_LOGWRAP ){
+    /* Case 1. Wrap around to the start of the file. Write an LSM_LOG_JUMP 
+    ** into the log file in this case. Pad it out to 8 bytes using a PAD2
+    ** record so that the checksums can be updated immediately.  */
+    u8 aJump[] = { 
+      LSM_LOG_PAD2, 0x04, 0x00, 0x00, 0x00, 0x00, LSM_LOG_JUMP, 0x00 
+    };
+
+    lsmStringBinAppend(&pNew->buf, aJump, sizeof(aJump));
+    logUpdateCksum(pNew, pNew->buf.n);
+    rc = lsmFsWriteLog(pDb->pFS, aReg[2].iEnd, &pNew->buf);
+    pNew->iCksumBuf = pNew->buf.n = 0;
+
+    /* The old region 2 (now including the 8 bytes just written) becomes
+    ** region 0 and the region to jump over; the new region 2 starts empty
+    ** at offset 0. */
+    aReg[2].iEnd += 8;
+    pNew->jump = aReg[0] = aReg[2];
+    aReg[2].iStart = aReg[2].iEnd = 0;
+  }else if( aReg[1].iEnd==0 && aReg[2].iEnd<aReg[0].iEnd ){
+    /* Case 2. */
+    pNew->iOff = aReg[2].iEnd;
+    pNew->jump = aReg[0];
+  }else{
+    /* Case 3. */
+    assert( aReg[2].iStart>=aReg[0].iEnd && aReg[2].iStart>=aReg[1].iEnd );
+    pNew->iOff = aReg[2].iEnd;
+  }
+
+  /* If there is a region to jump over, widen it to whole sectors: round
+  ** its start down (unless that would move it to or before the current
+  ** write offset) and its end up to the last byte of its sector. */
+  if( pNew->jump.iStart ){
+    i64 iRound;
+    assert( pNew->jump.iStart>pNew->iOff );
+
+    iRound = firstByteOnSector(pNew, pNew->jump.iStart);
+    if( iRound>pNew->iOff ) pNew->jump.iStart = iRound;
+    pNew->jump.iEnd = lastByteOnSector(pNew, pNew->jump.iEnd);
+  }
+
+  assert( pDb->pLogWriter==pNew );
+  return rc;
+}
+
+/*
+** This function is called when a write-transaction is being closed.
+** Parameter bCommit is true if the transaction is being committed,
+** or false otherwise. The caller must hold the client-mutex to call
+** this function.
+**
+** If the transaction is being committed, the log state stored in
+** pDb->treehdr.log (region boundaries and checksums) is updated from the
+** LogWriter before returning. If it is not, nothing is modified.
+**
+** The LogWriter object itself is not freed here; it remains attached to
+** pDb and is reset by the next call to lsmLogBegin(). This function is a
+** no-op if no LogWriter has been allocated.
+*/
+void lsmLogEnd(lsm_db *pDb, int bCommit){
+  DbLog *pLog;
+  LogWriter *p;
+  p = pDb->pLogWriter;
+
+  if( p==0 ) return;
+  pLog = &pDb->treehdr.log;
+
+  if( bCommit ){
+    pLog->aRegion[2].iEnd = p->iOff;
+    pLog->cksum0 = p->cksum0;
+    pLog->cksum1 = p->cksum1;
+    if( p->iRegion1End ){
+      /* This happens when the transaction had to jump over some other
+      ** part of the log. What was region 2 up to the jump becomes
+      ** region 1, and region 2 restarts at the jump destination. */
+      assert( pLog->aRegion[1].iEnd==0 );
+      assert( pLog->aRegion[2].iStart<p->iRegion1End );
+      pLog->aRegion[1].iStart = pLog->aRegion[2].iStart;
+      pLog->aRegion[1].iEnd = p->iRegion1End;
+      pLog->aRegion[2].iStart = p->iRegion2Start;
+    }
+  }
+}
+
+/*
+** This is called before a record requiring nReq bytes is added to the log
+** buffer. A jump is needed if the jump region (pLog->jump) starts after
+** the current write position but there is insufficient room before it to
+** fit both the new record and the largest possible JUMP record with up to
+** 7 bytes of padding (a total of 17 bytes).
+**
+** When a jump is needed, a JUMP record is appended to the buffer (preceded
+** by padding so that the buffer is a multiple of 8 bytes in size), the
+** buffer is written to disk and the write position is moved to the first
+** byte following the jump region. If pbJump is not NULL, *pbJump is then
+** set to 1; it is left untouched if no jump is taken.
+**
+** LSM_OK is returned if successful, or an LSM error code otherwise.
+*/
+static int jumpIfRequired(
+  lsm_db *pDb,
+  LogWriter *pLog,
+  int nReq,
+  int *pbJump
+){
+  i64 iCur = pLog->iOff + pLog->buf.n;  /* Offset of next byte appended */
+  i64 iTarget;                    /* Offset to jump to */
+  u8 aRec[10];                    /* Encoded jump record */
+  int nRec;                       /* Valid bytes in aRec[] */
+  int nRem;                       /* Bytes past the last 8-byte boundary */
+  int rc;                         /* Return code */
+
+  /* Nothing to do unless the jump region lies ahead, and too close. */
+  if( pLog->jump.iStart<=iCur ) return LSM_OK;
+  if( pLog->jump.iStart>=(iCur + (nReq + 17)) ) return LSM_OK;
+
+  /* Serialize the JUMP record */
+  iTarget = pLog->jump.iEnd+1;
+  aRec[0] = LSM_LOG_JUMP;
+  nRec = 1 + lsmVarintPut64(&aRec[1], iTarget);
+
+  /* Pad the buffer so that it is a multiple of 8 bytes in size once the
+  ** JUMP record has been appended. Not strictly required, but it keeps
+  ** the running checksum simple to maintain. */
+  nRem = (pLog->buf.n + nRec) % 8;
+  if( nRem!=0 ){
+    u8 aPad[7] = {0,0,0,0,0,0,0};
+    int nPad = 8 - nRem;
+    if( nPad==1 ){
+      aPad[0] = LSM_LOG_PAD1;
+    }else{
+      aPad[0] = LSM_LOG_PAD2;
+      aPad[1] = (u8)(nPad-2);
+    }
+    rc = lsmStringBinAppend(&pLog->buf, aPad, nPad);
+    if( rc!=LSM_OK ) return rc;
+  }
+
+  /* Append the JUMP record and flush the buffer to disk, then update the
+  ** checksums. Assuming no rollback, the next write to the log file is
+  ** to offset iTarget, just past the jump region. */
+  rc = lsmStringBinAppend(&pLog->buf, aRec, nRec);
+  if( rc!=LSM_OK ) return rc;
+  assert( (pLog->buf.n % 8)==0 );
+  rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf);
+  if( rc!=LSM_OK ) return rc;
+  logUpdateCksum(pLog, pLog->buf.n);
+
+  pLog->iRegion1End = (pLog->iOff + pLog->buf.n);
+  pLog->iRegion2Start = iTarget;
+  pLog->iOff = iTarget;
+  pLog->iCksumBuf = pLog->buf.n = 0;
+  if( pbJump ) *pbJump = 1;
+
+  return LSM_OK;
+}
+
+/*
+** Bring the running checksum up to date with the whole write buffer,
+** append the two 32-bit checksum values to the buffer, and write the
+** buffer to the log file at the current offset. The buffer is emptied and
+** the offset advanced whether or not the write succeeds. The caller must
+** already have reserved 8 bytes of space at the end of the buffer.
+**
+** Returns the result of the call to lsmFsWriteLog().
+*/
+static int logCksumAndFlush(lsm_db *pDb){
+  LogWriter *pLog = pDb->pLogWriter;
+  u8 *aOut;                       /* Where the checksum is stored */
+  int rc;                         /* Return code */
+
+  /* Calculate the checksum value. Append it to the buffer. */
+  logUpdateCksum(pLog, pLog->buf.n);
+  aOut = (u8 *)&pLog->buf.z[pLog->buf.n];
+  lsmPutU32(&aOut[0], pLog->cksum0);
+  lsmPutU32(&aOut[4], pLog->cksum1);
+  pLog->buf.n += 8;
+
+  /* Write the contents of the buffer to disk. */
+  rc = lsmFsWriteLog(pDb->pFS, pLog->iOff, &pLog->buf);
+  pLog->iOff += pLog->buf.n;
+  pLog->iCksumBuf = 0;
+  pLog->buf.n = 0;
+
+  return rc;
+}
+
+/*
+** Append a record of type eType, followed by its 8-byte checksum, to the
+** log buffer and write the buffer to disk. eType must currently be
+** LSM_LOG_COMMIT (this is asserted).
+**
+** If the effective sector size is greater than 1, padding records are
+** added first so that the COMMIT record ends on a sector boundary. If the
+** safety-mode is FULL, the log file is synced after the write.
+**
+** Returns LSM_OK if successful, or an LSM error code otherwise.
+*/
+static int logFlush(lsm_db *pDb, int eType){
+  int rc;
+  int nReq;
+  LogWriter *pLog = pDb->pLogWriter;
+  
+  assert( eType==LSM_LOG_COMMIT );
+  assert( pLog );
+
+  /* Commit record is always 9 bytes in size. */
+  nReq = 9;
+  if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ) nReq += pLog->szSector + 17;
+  rc = jumpIfRequired(pDb, pLog, nReq, 0);
+  /* A failed jump must not be ignored: rc is reassigned below, so without
+  ** this check the error would be lost and the COMMIT written anyway. */
+  if( rc!=LSM_OK ) return rc;
+
+  /* If this is a COMMIT, add padding to the log so that the COMMIT record
+  ** is aligned against the end of a disk sector. In other words, add padding
+  ** so that the first byte following the COMMIT record lies on a different
+  ** sector.  */
+  if( eType==LSM_LOG_COMMIT && pLog->szSector>1 ){
+    int nPad;                     /* Bytes of padding to add */
+
+    /* Determine the value of nPad. */
+    nPad = ((pLog->iOff + pLog->buf.n + 9) % pLog->szSector);
+    if( nPad ) nPad = pLog->szSector - nPad;
+    rc = lsmStringExtend(&pLog->buf, nPad);
+    if( rc!=LSM_OK ) return rc;
+
+    /* Emit a single PAD1 for one byte, otherwise PAD2 records each
+    ** carrying at most 200 bytes of filler (0x2B). */
+    while( nPad ){
+      if( nPad==1 ){
+        pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD1;
+        nPad = 0;
+      }else{
+        int n = LSM_MIN(200, nPad-2);
+        pLog->buf.z[pLog->buf.n++] = LSM_LOG_PAD2;
+        pLog->buf.z[pLog->buf.n++] = (char)n;
+        nPad -= 2;
+        memset(&pLog->buf.z[pLog->buf.n], 0x2B, n);
+        pLog->buf.n += n;
+        nPad -= n;
+      }
+    }
+  }
+
+  /* Make sure there is room in the log-buffer to add the COMMIT record
+  ** (type byte plus 8-byte checksum). Then add the first byte of it. */
+  rc = lsmStringExtend(&pLog->buf, 9);
+  if( rc!=LSM_OK ) return rc;
+  pLog->buf.z[pLog->buf.n++] = (char)eType;
+  memset(&pLog->buf.z[pLog->buf.n], 0, 8);
+
+  rc = logCksumAndFlush(pDb);
+
+  /* If this is a commit and synchronous=full, sync the log to disk. */
+  if( rc==LSM_OK && eType==LSM_LOG_COMMIT && pDb->eSafety==LSM_SAFETY_FULL ){
+    rc = lsmFsSyncLog(pDb->pFS);
+  }
+  return rc;
+}
+
+/*
+** Append a record of type eType - LSM_WRITE, LSM_DELETE or LSM_DRANGE -
+** to the database log. A DELETE record carries the key only, and must be
+** passed a negative nVal. The other two types carry both key and value.
+**
+** The record is added to the in-memory log buffer. The buffer is only
+** written to disk here if a checksum is embedded in the record or a jump
+** over another log region is required. This function is a no-op if
+** logging is disabled. Returns LSM_OK or an LSM error code.
+*/
+int lsmLogWrite(
+  lsm_db *pDb,                    /* Database handle */
+  int eType,
+  void *pKey, int nKey,           /* Database key to write to log */
+  void *pVal, int nVal            /* Database value (or nVal<0) to write */
+){
+  int rc = LSM_OK;
+  LogWriter *pLog;                /* Log object to write to */
+  int nReq;                       /* Bytes of space required in log */
+  int bCksum = 0;                 /* True to embed a checksum in this record */
+  int bHasVal = (eType!=LSM_LOG_DELETE);  /* True if a value is stored */
+  u8 *a;                          /* Write cursor within the log buffer */
+
+  assert( eType==LSM_WRITE || eType==LSM_DELETE || eType==LSM_DRANGE );
+  assert( LSM_LOG_WRITE==LSM_WRITE );
+  assert( LSM_LOG_DELETE==LSM_DELETE );
+  assert( LSM_LOG_DRANGE==LSM_DRANGE );
+  assert( (eType==LSM_LOG_DELETE)==(nVal<0) );
+
+  if( pDb->bUseLog==0 ) return LSM_OK;
+  pLog = pDb->pLogWriter;
+
+  /* Work out the space required, on the assumption that a checksum will
+  ** be embedded in this record (even though it may not be). */
+  nReq = 1 + lsmVarintLen32(nKey) + 8 + nKey;
+  if( bHasVal ) nReq += lsmVarintLen32(nVal) + nVal;
+
+  /* Jump over the jump region if required. A checksum is embedded in the
+  ** record if either (a) writing it would mean that more than
+  ** LSM_CKSUM_MAXDATA bytes of data have been written to the log since
+  ** the last checksum, or (b) the jump is taken. */
+  rc = jumpIfRequired(pDb, pLog, nReq, &bCksum);
+  if( (pLog->buf.n+nReq) > LSM_CKSUM_MAXDATA ) bCksum = 1;
+
+  if( rc==LSM_OK ){
+    rc = lsmStringExtend(&pLog->buf, nReq);
+  }
+  if( rc!=LSM_OK ) return rc;
+
+  a = (u8 *)&pLog->buf.z[pLog->buf.n];
+
+  /* Write the record header - the type byte followed by either 1 (for
+  ** DELETE) or 2 (for the other types) varints. The low bit of the type
+  ** byte flags an embedded checksum. */
+  assert( LSM_LOG_WRITE_CKSUM == (LSM_LOG_WRITE | 0x0001) );
+  assert( LSM_LOG_DELETE_CKSUM == (LSM_LOG_DELETE | 0x0001) );
+  assert( LSM_LOG_DRANGE_CKSUM == (LSM_LOG_DRANGE | 0x0001) );
+  *(a++) = (u8)eType | (u8)bCksum;
+  a += lsmVarintPut32(a, nKey);
+  if( bHasVal ) a += lsmVarintPut32(a, nVal);
+
+  /* The checksum follows the header. Writing it flushes and empties the
+  ** buffer, so the cursor must be re-derived afterwards. */
+  if( bCksum ){
+    pLog->buf.n = (a - (u8 *)pLog->buf.z);
+    rc = logCksumAndFlush(pDb);
+    a = (u8 *)&pLog->buf.z[pLog->buf.n];
+  }
+
+  /* Finally the key, and the value if there is one. */
+  memcpy(a, pKey, nKey);
+  a += nKey;
+  if( bHasVal ){
+    memcpy(a, pVal, nVal);
+    a += nVal;
+  }
+  pLog->buf.n = a - (u8 *)pLog->buf.z;
+  assert( pLog->buf.n<=pLog->buf.nAlloc );
+
+  return rc;
+}
+
+/*
+** Write an LSM_LOG_COMMIT record to the database log. If logging is
+** disabled for this connection, do nothing and return LSM_OK.
+*/
+int lsmLogCommit(lsm_db *pDb){
+  int rc = LSM_OK;
+  if( pDb->bUseLog!=0 ){
+    rc = logFlush(pDb, LSM_LOG_COMMIT);
+  }
+  return rc;
+}
+
+/*
+** Record the current log position in *pMark so that a later call to
+** lsmLogSeek() can "rewind" the LogWriter to it. This is used when
+** rolling back savepoint transactions.
+**
+** The mark holds the current log file offset, the running checksum over
+** all buffered data up to the last 8-byte boundary, and a copy of the
+** (at most 7) buffered bytes that follow that boundary. This function is
+** a no-op if logging is disabled.
+*/
+void lsmLogTell(
+  lsm_db *pDb,                    /* Database handle */
+  LogMark *pMark                  /* Populate this object with current offset */
+){
+  LogWriter *pLog;
+  int nCksum;                     /* Buffer size rounded down to 8 bytes */
+
+  if( pDb->bUseLog ){
+    pLog = pDb->pLogWriter;
+
+    /* Bring the checksum up to date with the 8-byte aligned prefix. */
+    nCksum = pLog->buf.n & 0xFFFFFFF8;
+    logUpdateCksum(pLog, nCksum);
+    assert( pLog->iCksumBuf==nCksum );
+
+    pMark->cksum0 = pLog->cksum0;
+    pMark->cksum1 = pLog->cksum1;
+    pMark->iOff = pLog->iOff + pLog->buf.n;
+
+    /* Save the bytes that are not yet part of the checksum. */
+    pMark->nBuf = pLog->buf.n - nCksum;
+    memcpy(pMark->aBuf, &pLog->buf.z[nCksum], pMark->nBuf);
+  }
+}
+
+/*
+** Seek (rewind) back to the log file offset stored by an earlier call to
+** lsmLogTell() in *pMark.
+**
+** If the mark, rounded down to a multiple of 8, lies at or after the
+** start of the current write buffer, the buffer is simply truncated.
+** Otherwise the buffer is replaced by the bytes saved in the mark and the
+** buffer start offset is moved back accordingly. In both cases the
+** checksums saved in the mark are restored.
+**
+** This function is a no-op if logging is disabled.
+*/
+void lsmLogSeek(
+  lsm_db *pDb,                    /* Database handle */
+  LogMark *pMark                  /* Object containing log offset to seek to */
+){
+  LogWriter *pLog;
+
+  if( pDb->bUseLog==0 ) return;
+  pLog = pDb->pLogWriter;
+
+  assert( pMark->iOff<=pLog->iOff+pLog->buf.n );
+  /* Round down with a 64-bit mask. The 32-bit constant 0xFFFFFFF8 would
+  ** also clear the upper 32 bits of the offset, selecting the wrong branch
+  ** (and discarding unflushed buffer content) for offsets of 4GiB or
+  ** more. */
+  if( (pMark->iOff & ~(i64)7)>=pLog->iOff ){
+    pLog->buf.n = (int)(pMark->iOff - pLog->iOff);
+    pLog->iCksumBuf = (pLog->buf.n & 0xFFFFFFF8);
+  }else{
+    pLog->buf.n = pMark->nBuf;
+    memcpy(pLog->buf.z, pMark->aBuf, pMark->nBuf);
+    pLog->iCksumBuf = 0;
+    pLog->iOff = pMark->iOff - pMark->nBuf;
+  }
+  pLog->cksum0 = pMark->cksum0;
+  pLog->cksum1 = pMark->cksum1;
+
+  /* Forget any jump recorded by this transaction that lies beyond the
+  ** restored offset. */
+  if( pMark->iOff > pLog->iRegion1End ) pLog->iRegion1End = 0;
+  if( pMark->iOff > pLog->iRegion2Start ) pLog->iRegion2Start = 0;
+}
+
+/*
+** Implementation of the lsm_info(LOG_STRUCTURE) request. On success,
+** *pzVal is set to point to an allocated string holding six integers -
+** the start and end offsets of log regions 0, 1 and 2, in that order -
+** and LSM_OK is returned. If an error occurs, *pzVal is set to NULL and
+** an LSM error code is returned.
+*/
+int lsmInfoLogStructure(lsm_db *pDb, char **pzVal){
+  char *zVal = 0;
+  int rc = LSM_OK;
+
+  /* If there is no read or write transaction open, read the latest 
+  ** tree-header from shared-memory to report on. If necessary, update
+  ** it based on the contents of the database header.  
+  **
+  ** No locks are taken here - these are passive read operations only.
+  */
+  if( pDb->pCsr==0 && pDb->nTransOpen==0 ){
+    rc = lsmTreeLoadHeader(pDb, 0);
+    if( rc==LSM_OK ) rc = logReclaimSpace(pDb);
+  }
+
+  if( rc==LSM_OK ){
+    LogRegion *aReg = pDb->treehdr.log.aRegion;
+    zVal = lsmMallocPrintf(pDb->pEnv, 
+        "%d %d %d %d %d %d",
+        (int)aReg[0].iStart, (int)aReg[0].iEnd,
+        (int)aReg[1].iStart, (int)aReg[1].iEnd,
+        (int)aReg[2].iStart, (int)aReg[2].iEnd
+    );
+    if( !zVal ) rc = LSM_NOMEM_BKPT;
+  }
+
+  *pzVal = zVal;
+  return rc;
+}
+
+/*************************************************************************
+** Begin code for log recovery.
+*/
+
+typedef struct LogReader LogReader; /* Cursor used to read the log during recovery */
+struct LogReader {
+ FileSystem *pFS; /* File system to read from */
+ i64 iOff; /* File offset at end of buf content (where the next read starts) */
+ int iBuf; /* Current read offset in buf */
+ LsmString buf; /* Buffer containing file content */
+
+ int iCksumBuf; /* Offset in buf corresponding to cksum[01] */
+ u32 cksum0; /* Checksum 0 at offset iCksumBuf */
+ u32 cksum1; /* Checksum 1 at offset iCksumBuf */
+};
+
+/*
+** Read the next nBlob bytes from the log. This is a no-op if *pRc is not
+** LSM_OK on entry; if an error occurs here, *pRc is set to the error code.
+**
+** If ppBlob is not NULL, *ppBlob is set to point to the bytes read. The
+** pointer refers directly into the reader's buffer when all nBlob bytes
+** are available there at the first attempt. Otherwise the bytes are
+** accumulated in pBuf, whose previous content is discarded.
+**
+** Whenever the reader's buffer is exhausted, the running checksum is
+** advanced over all complete 8-byte groups consumed so far, the 0-7
+** leftover bytes are moved to the start of the buffer, and another
+** LOG_READ_SIZE bytes are requested from the log file.
+*/
+static void logReaderBlob(
+  LogReader *p,                   /* Log reader object */
+  LsmString *pBuf,                /* Dynamic storage, if required */
+  int nBlob,                      /* Number of bytes to read */
+  u8 **ppBlob,                    /* OUT: Pointer to blob read */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  static const int LOG_READ_SIZE = 512;
+  int rc = *pRc;                  /* Return code */
+  int nLeft = nBlob;              /* Bytes still to be read */
+
+  while( rc==LSM_OK && nLeft>0 ){
+    int nAvail;                   /* Bytes of data available in p->buf */
+
+    if( p->buf.n==p->iBuf ){
+      /* Buffer exhausted. Checksum what can be checksummed, carry the
+      ** remainder over, then read more data. */
+      int nPending = p->iBuf - p->iCksumBuf;  /* Bytes not yet checksummed */
+      int nCarry = (nPending>0) ? (nPending % 8) : 0;
+      int nWhole = nPending - nCarry;         /* Multiple of 8 when >0 */
+
+      if( nWhole>0 ){
+        logCksumUnaligned(
+            &p->buf.z[p->iCksumBuf], nWhole, &p->cksum0, &p->cksum1
+        );
+      }
+      if( nCarry>0 ) memcpy(p->buf.z, &p->buf.z[p->iBuf-nCarry], nCarry);
+      p->buf.n = nCarry;
+      p->iBuf = nCarry;
+
+      rc = lsmFsReadLog(p->pFS, p->iOff, LOG_READ_SIZE, &p->buf);
+      if( rc!=LSM_OK ) break;
+      p->iCksumBuf = 0;
+      p->iOff += LOG_READ_SIZE;
+    }
+
+    nAvail = p->buf.n - p->iBuf;
+    if( ppBlob && nLeft==nBlob && nBlob<=nAvail ){
+      /* The whole blob is available in one piece. No copy required. */
+      *ppBlob = (u8 *)&p->buf.z[p->iBuf];
+      p->iBuf += nBlob;
+      nLeft = 0;
+    }else{
+      /* Copy as much as possible into pBuf, emptying it first if this
+      ** is the first piece of the blob. */
+      int nCopy = LSM_MIN(nAvail, nLeft);
+      if( nBlob==nLeft ){
+        pBuf->n = 0;
+      }
+      rc = lsmStringBinAppend(pBuf, (u8 *)&p->buf.z[p->iBuf], nCopy);
+      nLeft -= nCopy;
+      p->iBuf += nCopy;
+      if( nLeft==0 && ppBlob ){
+        *ppBlob = (u8*)pBuf->z;
+      }
+    }
+  }
+
+  *pRc = rc;
+}
+
+/*
+** Read a varint from the log and store its value in *piVal. This is a
+** no-op if *pRc is not LSM_OK on entry.
+**
+** If unread data remains in the reader's buffer, the size of the varint
+** is derived from its first byte and exactly that many bytes are read.
+** If the buffer is exhausted, 10 bytes are read and the read position is
+** then moved back over those that were not part of the varint.
+*/
+static void logReaderVarint(
+  LogReader *p, 
+  LsmString *pBuf,
+  int *piVal,                     /* OUT: Value read from log */
+  int *pRc                        /* IN/OUT: Error code */
+){
+  u8 *aVarint;
+
+  if( *pRc!=LSM_OK ) return;
+
+  if( p->buf.n!=p->iBuf ){
+    int nByte = lsmVarintSize(p->buf.z[p->iBuf]);
+    logReaderBlob(p, pBuf, nByte, &aVarint, pRc);
+    if( LSM_OK==*pRc ) lsmVarintGet32(aVarint, piVal);
+  }else{
+    logReaderBlob(p, 0, 10, &aVarint, pRc);
+    if( LSM_OK==*pRc ){
+      int nUsed = lsmVarintGet32(aVarint, piVal);
+      p->iBuf -= (10 - nUsed);
+    }
+  }
+}
+
+/*
+** Read a single byte from the log into *pByte. If no byte can be read
+** (*pRc is, or becomes, an error code), *pByte is left unmodified.
+*/
+static void logReaderByte(LogReader *p, u8 *pByte, int *pRc){
+  u8 *aByte = 0;
+  logReaderBlob(p, 0, 1, &aByte, pRc);
+  if( aByte!=0 ){
+    *pByte = aByte[0];
+  }
+}
+
+/*
+** Read an 8-byte checksum from the log and compare it with the checksum
+** calculated over the data read so far. *pbEof is set to true if the two
+** differ, or to false if they match. This is a no-op if *pRc is not
+** LSM_OK on entry, and *pbEof is left unmodified if the read fails.
+*/
+static void logReaderCksum(LogReader *p, LsmString *pBuf, int *pbEof, int *pRc){
+  u8 *aStored = 0;                /* Checksum as stored in the log */
+  int nPending;                   /* Bytes read but not yet checksummed */
+
+  if( *pRc!=LSM_OK ) return;
+
+  /* Update in-memory (expected) checksums */
+  nPending = p->iBuf - p->iCksumBuf;
+  assert( nPending>=0 );
+  logCksumUnaligned(&p->buf.z[p->iCksumBuf], nPending, &p->cksum0, &p->cksum1);
+
+  /* Move iCksumBuf past the stored checksum before reading it, so that
+  ** its 8 bytes are not folded into the running checksum should
+  ** logReaderBlob() need to refill the buffer part way through. */
+  p->iCksumBuf = p->iBuf + 8;
+  logReaderBlob(p, pBuf, 8, &aStored, pRc);
+  assert( aStored || *pRc );
+
+  /* Compare stored and expected checksums. Set *pbEof if they differ. */
+  if( aStored ){
+    u32 cksum0 = lsmGetU32(aStored);
+    u32 cksum1 = lsmGetU32(&aStored[4]);
+    *pbEof = (cksum0!=p->cksum0 || cksum1!=p->cksum1);
+    p->iCksumBuf = p->iBuf;
+  }
+}
+
+/*
+** Prepare LogReader *p to read the log of database pDb, starting at the
+** beginning of region 2 with the checksum values stored in *pLog. If
+** bInitBuf is true the reader's buffer is initialized first. Either way
+** the buffer is left empty.
+*/
+static void logReaderInit(
+  lsm_db *pDb,                    /* Database handle */
+  DbLog *pLog,                    /* Log object associated with pDb */
+  int bInitBuf,                   /* True if p->buf is uninitialized */
+  LogReader *p                    /* Initialize this LogReader object */
+){
+  if( bInitBuf ){
+    lsmStringInit(&p->buf, pDb->pEnv);
+  }
+  p->buf.n = 0;
+  p->iBuf = 0;
+  p->iCksumBuf = 0;
+
+  p->pFS = pDb->pFS;
+  p->iOff = pLog->aRegion[2].iStart;
+  p->cksum0 = pLog->cksum0;
+  p->cksum1 = pLog->cksum1;
+}
+
+/*
+** Called after the header of a record without an embedded checksum has
+** been read. nByte is the total size of the key and value that follow
+** the header. Return true if reading them would leave more than
+** LSM_CKSUM_MAXDATA bytes of data since the last checksum - in which
+** case the record ought to have contained a checksum - or false otherwise.
+*/
+static int logRequireCksum(LogReader *p, int nByte){
+  int nSinceCksum = p->iBuf + nByte - p->iCksumBuf;
+  return (nSinceCksum > LSM_CKSUM_MAXDATA);
+}
+
+/*
+** Recover the contents of the log file.
+*/
+int lsmLogRecover(lsm_db *pDb){
+ LsmString buf1; /* Key buffer */
+ LsmString buf2; /* Value buffer */
+ LogReader reader; /* Log reader object */
+ int rc = LSM_OK; /* Return code */
+ int nCommit = 0; /* Number of transactions to recover */
+ int iPass; /* 0 = counting pass, 1 = replay pass */
+ int nJump = 0; /* Number of LSM_LOG_JUMP records in pass 0 */
+ DbLog *pLog; /* Points at pDb->treehdr.log */
+ int bOpen; /* Set by lsmFsOpenLog(); gates the read loop */
+
+ rc = lsmFsOpenLog(pDb, &bOpen);
+ if( rc!=LSM_OK ) return rc;
+
+ rc = lsmTreeInit(pDb);
+ if( rc!=LSM_OK ) return rc;
+
+ pLog = &pDb->treehdr.log;
+ lsmCheckpointLogoffset(pDb->pShmhdr->aSnap2, pLog);
+
+ logReaderInit(pDb, pLog, 1, &reader);
+ lsmStringInit(&buf1, pDb->pEnv);
+ lsmStringInit(&buf2, pDb->pEnv);
+
+ /* The outer for() loop runs at most twice. The first iteration is to
+ ** count the number of committed transactions in the log. The second
+ ** iterates through those transactions and updates the in-memory tree
+ ** structure with their contents. */
+ if( bOpen ){
+ for(iPass=0; iPass<2 && rc==LSM_OK; iPass++){
+ int bEof = 0; /* Set to end the current pass */
+
+ while( rc==LSM_OK && !bEof ){
+ u8 eType = 0; /* Record type; stays 0 if the read fails */
+ logReaderByte(&reader, &eType, &rc);
+
+ switch( eType ){
+ case LSM_LOG_PAD1: /* Type byte only; nothing further is read */
+ break;
+
+ case LSM_LOG_PAD2: { /* Varint length followed by that many bytes */
+ int nPad;
+ logReaderVarint(&reader, &buf1, &nPad, &rc);
+ logReaderBlob(&reader, &buf1, nPad, 0, &rc);
+ break;
+ }
+
+ case LSM_LOG_DRANGE:
+ case LSM_LOG_DRANGE_CKSUM:
+ case LSM_LOG_WRITE:
+ case LSM_LOG_WRITE_CKSUM: {
+ int nKey;
+ int nVal;
+ u8 *aVal;
+ logReaderVarint(&reader, &buf1, &nKey, &rc);
+ logReaderVarint(&reader, &buf2, &nVal, &rc);
+
+ if( eType==LSM_LOG_WRITE_CKSUM || eType==LSM_LOG_DRANGE_CKSUM ){
+ logReaderCksum(&reader, &buf1, &bEof, &rc);
+ }else{
+ bEof = logRequireCksum(&reader, nKey+nVal); /* Stop if a checksum was due */
+ }
+ if( bEof ) break;
+
+ logReaderBlob(&reader, &buf1, nKey, 0, &rc);
+ logReaderBlob(&reader, &buf2, nVal, &aVal, &rc);
+ if( iPass==1 && rc==LSM_OK ){ /* The tree is modified in pass 1 only */
+ if( eType==LSM_LOG_WRITE || eType==LSM_LOG_WRITE_CKSUM ){
+ rc = lsmTreeInsert(pDb, (u8 *)buf1.z, nKey, aVal, nVal);
+ }else{
+ rc = lsmTreeDelete(pDb, (u8 *)buf1.z, nKey, aVal, nVal);
+ }
+ }
+ break;
+ }
+
+ case LSM_LOG_DELETE:
+ case LSM_LOG_DELETE_CKSUM: {
+ int nKey; u8 *aKey;
+ logReaderVarint(&reader, &buf1, &nKey, &rc);
+
+ if( eType==LSM_LOG_DELETE_CKSUM ){
+ logReaderCksum(&reader, &buf1, &bEof, &rc);
+ }else{
+ bEof = logRequireCksum(&reader, nKey);
+ }
+ if( bEof ) break;
+
+ logReaderBlob(&reader, &buf1, nKey, &aKey, &rc);
+ if( iPass==1 && rc==LSM_OK ){
+ rc = lsmTreeInsert(pDb, aKey, nKey, NULL, -1); /* Delete = insert with NULL value, nVal -1 */
+ }
+ break;
+ }
+
+ case LSM_LOG_COMMIT:
+ logReaderCksum(&reader, &buf1, &bEof, &rc);
+ if( bEof==0 ){
+ nCommit++; /* Counts up in pass 0; counts up from -N toward 0 in pass 1 */
+ assert( nCommit>0 || iPass==1 );
+ if( nCommit==0 ) bEof = 1; /* Pass 1: every counted commit is replayed */
+ }
+ break;
+
+ case LSM_LOG_JUMP: {
+ int iOff = 0; /* Jump target read from the record */
+ logReaderVarint(&reader, &buf1, &iOff, &rc);
+ if( rc==LSM_OK ){
+ if( iPass==1 ){
+ if( pLog->aRegion[2].iStart==0 ){
+ assert( pLog->aRegion[1].iStart==0 );
+ pLog->aRegion[1].iEnd = reader.iOff;
+ }else{
+ assert( pLog->aRegion[0].iStart==0 );
+ pLog->aRegion[0].iStart = pLog->aRegion[2].iStart;
+ pLog->aRegion[0].iEnd = reader.iOff-reader.buf.n+reader.iBuf;
+ }
+ pLog->aRegion[2].iStart = iOff;
+ }else{
+ if( (nJump++)==2 ){ /* Pass 0 ends on the third jump record seen */
+ bEof = 1;
+ }
+ }
+
+ reader.iOff = iOff; /* Continue reading at the jump target */
+ reader.buf.n = reader.iBuf; /* Truncate the buffer at the read cursor */
+ }
+ break;
+ }
+
+ default:
+ /* Including LSM_LOG_EOF */
+ bEof = 1;
+ break;
+ }
+ }
+
+ if( rc==LSM_OK && iPass==0 ){
+ if( nCommit==0 ){
+ if( pLog->aRegion[2].iStart==0 ){
+ iPass = 1; /* Loop increment makes this 2, so pass 1 is skipped */
+ }else{
+ pLog->aRegion[2].iStart = 0;
+ iPass = -1; /* Loop increment makes this 0: redo pass 0 from offset 0 */
+ lsmCheckpointZeroLogoffset(pDb);
+ }
+ }
+ logReaderInit(pDb, pLog, 0, &reader); /* Rewind; reader.buf is reused */
+ nCommit = nCommit * -1; /* Negate so pass 1 can count back up to zero */
+ }
+ }
+ }
+
+ /* Initialize DbLog object */
+ if( rc==LSM_OK ){
+ pLog->aRegion[2].iEnd = reader.iOff - reader.buf.n + reader.iBuf;
+ pLog->cksum0 = reader.cksum0;
+ pLog->cksum1 = reader.cksum1;
+ }
+
+ if( rc==LSM_OK ){
+ rc = lsmFinishRecovery(pDb);
+ }else{
+ lsmFinishRecovery(pDb); /* Still called, but the earlier error code is kept */
+ }
+
+ if( pDb->bRoTrans ){
+ lsmFsCloseLog(pDb);
+ }
+
+ lsmStringClear(&buf1);
+ lsmStringClear(&buf2);
+ lsmStringClear(&reader.buf);
+ return rc;
+}
+
+void lsmLogClose(lsm_db *db){ /* Free the log-writer object, if there is one */
+ if( db->pLogWriter ){
+ lsmFree(db->pEnv, db->pLogWriter->buf.z); /* Free the writer's buffer first */
+ lsmFree(db->pEnv, db->pLogWriter); /* Then the writer object itself */
+ db->pLogWriter = 0; /* Clear the pointer so a repeat call is a no-op */
+ }
+}
diff --git a/ext/lsm1/lsm_main.c b/ext/lsm1/lsm_main.c
new file mode 100644
index 0000000..8a324a3
--- /dev/null
+++ b/ext/lsm1/lsm_main.c
@@ -0,0 +1,1008 @@
+/*
+** 2011-08-18
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** The main interface to the LSM module.
+*/
+#include "lsmInt.h"
+
+
+#ifdef LSM_DEBUG
+/*
+** Debugging hook: this function returns its only argument, rc, unchanged.
+**
+** When the library is built with LSM_DEBUG defined, this function is called
+** whenever an error code is generated (not propagated - generated). So
+** if the library is mysteriously returning (say) LSM_IOERR, a breakpoint
+** may be set in this function to determine why.
+*/
+int lsmErrorBkpt(int rc){
+ /* Set breakpoint here! */
+ return rc; /* Pass the error code through untouched */
+}
+
+/*
+** This function contains various assert() statements that test that the
+** lsm_db structure passed as an argument is internally consistent.
+*/
+static void assert_db_state(lsm_db *pDb){
+
+ /* If there is at least one cursor or a write transaction open, the database
+ ** handle must be holding a pointer to a client snapshot. And the reverse
+ ** - if there are no open cursors and no write transactions then there must
+ ** not be a client snapshot. */
+
+ assert( (pDb->pCsr!=0||pDb->nTransOpen>0)==(pDb->iReader>=0||pDb->bRoTrans) );
+ /* If iReader>=0 or bRoTrans is set, pClient must be non-NULL. */
+ assert( (pDb->iReader<0 && pDb->bRoTrans==0) || pDb->pClient!=0 );
+ /* The count of open transactions is never negative. */
+ assert( pDb->nTransOpen>=0 );
+}
+#else
+# define assert_db_state(x)
+#endif
+
+/*
+** The default key-compare function.
+*/
+static int xCmp(void *p1, int n1, void *p2, int n2){
+ int res; /* Negative, zero or positive, as for memcmp() */
+ res = memcmp(p1, p2, LSM_MIN(n1, n2)); /* Compare the common prefix bytewise */
+ if( res==0 ) res = (n1-n2); /* Equal prefix: the shorter key sorts first */
+ return res;
+}
+
+static void xLog(void *pCtx, int rc, const char *z){ /* Default log callback: print z to stderr */
+ (void)(rc); /* Unused */
+ (void)(pCtx); /* Unused */
+ fprintf(stderr, "%s\n", z);
+ fflush(stderr); /* Flush right away so the message is not left buffered */
+}
+
+/*
+** Allocate a new db handle and fill it with the compile-time defaults.
+*/
+int lsm_new(lsm_env *pEnv, lsm_db **ppDb){
+ lsm_db *pDb; /* The new handle; also written to *ppDb */
+
+ /* If the user did not provide an environment, use the default. */
+ if( pEnv==0 ) pEnv = lsm_default_env();
+ assert( pEnv );
+
+ /* Allocate the new database handle */
+ *ppDb = pDb = (lsm_db *)lsmMallocZero(pEnv, sizeof(lsm_db));
+ if( pDb==0 ) return LSM_NOMEM_BKPT; /* *ppDb has been set to NULL here */
+
+ /* Initialize the new object */
+ pDb->pEnv = pEnv;
+ pDb->nTreeLimit = LSM_DFLT_AUTOFLUSH;
+ pDb->nAutockpt = LSM_DFLT_AUTOCHECKPOINT;
+ pDb->bAutowork = LSM_DFLT_AUTOWORK;
+ pDb->eSafety = LSM_DFLT_SAFETY;
+ pDb->xCmp = xCmp; /* Default key comparison */
+ pDb->nDfltPgsz = LSM_DFLT_PAGE_SIZE;
+ pDb->nDfltBlksz = LSM_DFLT_BLOCK_SIZE;
+ pDb->nMerge = LSM_DFLT_AUTOMERGE;
+ pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES;
+ pDb->bUseLog = LSM_DFLT_USE_LOG;
+ pDb->iReader = -1; /* Other code in this file tests iReader<0 */
+ pDb->iRwclient = -1; /* assertRwclientLockValue() tests iRwclient>=0 */
+ pDb->bMultiProc = LSM_DFLT_MULTIPLE_PROCESSES;
+ pDb->iMmap = LSM_DFLT_MMAP;
+ pDb->xLog = xLog; /* Default logger */
+ pDb->compress.iId = LSM_COMPRESSION_NONE;
+ return LSM_OK;
+}
+
+lsm_env *lsm_get_env(lsm_db *pDb){ /* Return the environment used by pDb */
+ assert( pDb->pEnv ); /* lsm_new() always stores a non-NULL environment */
+ return pDb->pEnv;
+}
+
+/*
+** If database handle pDb is currently holding a client snapshot, but does
+** not have any open cursors or write transactions, release it.
+*/
+static void dbReleaseClientSnapshot(lsm_db *pDb){
+ if( pDb->nTransOpen==0 && pDb->pCsr==0 ){ /* Nothing open on this handle */
+ lsmFinishReadTrans(pDb);
+ }
+}
+
+static int getFullpathname( /* Resolve zRel via pEnv->xFullpath() into a new buffer */
+ lsm_env *pEnv, /* Environment supplying xFullpath */
+ const char *zRel, /* Path as passed in by the caller */
+ char **pzAbs /* OUT: allocated result, or NULL on error */
+){
+ int nAlloc = 0; /* Size of zAlloc passed to the previous call */
+ char *zAlloc = 0; /* Output buffer; NULL on the first call */
+ int nReq = 0; /* In/out size argument of xFullpath() */
+ int rc;
+
+ do{ /* Repeat while xFullpath() asks for more space than was supplied */
+ nAlloc = nReq;
+ rc = pEnv->xFullpath(pEnv, zRel, zAlloc, &nReq);
+ if( nReq>nAlloc ){
+ zAlloc = lsmReallocOrFreeRc(pEnv, zAlloc, nReq, &rc); /* Frees zAlloc if rc is set */
+ }
+ }while( nReq>nAlloc && rc==LSM_OK );
+
+ if( rc!=LSM_OK ){
+ lsmFree(pEnv, zAlloc);
+ zAlloc = 0;
+ }
+ *pzAbs = zAlloc; /* Always written, even on failure */
+ return rc;
+}
+
+/*
+** Check that the bits in the db->mLock mask are consistent with the
+** value stored in db->iRwclient. An assert shall fail otherwise.
+*/
+static void assertRwclientLockValue(lsm_db *db){
+#ifndef NDEBUG
+ u64 msk; /* Mask of mLock bits for RWCLIENT locks */
+ u64 rwclient = 0; /* Bit corresponding to db->iRwclient */
+
+ if( db->iRwclient>=0 ){ /* Otherwise no RWCLIENT bit is expected */
+ rwclient = ((u64)1 << (LSM_LOCK_RWCLIENT(db->iRwclient)-1));
+ }
+ msk = ((u64)1 << (LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT)-1)) - 1; /* All bits below the end of the range */
+ msk -= (((u64)1 << (LSM_LOCK_RWCLIENT(0)-1)) - 1); /* Remove bits below RWCLIENT(0) */
+
+ assert( (db->mLock & msk)==rwclient );
+#endif
+}
+
+/*
+** Open a new connection to database zFilename.
+*/
+int lsm_open(lsm_db *pDb, const char *zFilename){
+ int rc;
+
+ if( pDb->pDatabase ){
+ rc = LSM_MISUSE; /* pDb->pDatabase is already set */
+ }else{
+ char *zFull; /* Absolute path; set by getFullpathname() */
+
+ /* Translate the possibly relative pathname supplied by the user into
+ ** an absolute pathname. This is required because the supplied path
+ ** is used (either directly or with "-log" appended to it) for more
+ ** than one purpose - to open both the database and log files, and
+ ** perhaps to unlink the log file during disconnection. An absolute
+ ** path is required to ensure that the correct files are operated
+ ** on even if the application changes the cwd. */
+ rc = getFullpathname(pDb->pEnv, zFilename, &zFull);
+ assert( rc==LSM_OK || zFull==0 );
+
+ /* Connect to the database. */
+ if( rc==LSM_OK ){
+ rc = lsmDbDatabaseConnect(pDb, zFull);
+ }
+
+ if( pDb->bReadonly==0 ){
+ /* Configure the file-system connection with the page-size and block-size
+ ** of this database. Even if the database file is zero bytes in size
+ ** on disk, these values have been set in shared-memory by now, and so
+ ** are guaranteed not to change during the lifetime of this connection.
+ */
+ if( rc==LSM_OK && LSM_OK==(rc = lsmCheckpointLoad(pDb, 0)) ){
+ lsmFsSetPageSize(pDb->pFS, lsmCheckpointPgsz(pDb->aSnapshot));
+ lsmFsSetBlockSize(pDb->pFS, lsmCheckpointBlksz(pDb->aSnapshot));
+ }
+ }
+
+ lsmFree(pDb->pEnv, zFull); /* Release the absolute-path buffer */
+ assertRwclientLockValue(pDb);
+ }
+
+ assert( pDb->bReadonly==0 || pDb->bReadonly==1 );
+ assert( rc!=LSM_OK || (pDb->pShmhdr==0)==(pDb->bReadonly==1) ); /* On success pShmhdr is NULL exactly when read-only */
+
+ return rc;
+}
+
+int lsm_close(lsm_db *pDb){ /* Close and free a handle; a NULL pDb is a no-op */
+ int rc = LSM_OK;
+ if( pDb ){
+ assert_db_state(pDb);
+ if( pDb->pCsr || pDb->nTransOpen ){
+ rc = LSM_MISUSE_BKPT; /* Cursors or transactions still open: nothing freed */
+ }else{
+ lsmMCursorFreeCache(pDb);
+ lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
+ pDb->pClient = 0;
+
+ assertRwclientLockValue(pDb);
+
+ lsmDbDatabaseRelease(pDb);
+ lsmLogClose(pDb);
+ lsmFsClose(pDb->pFS);
+ /* assert( pDb->mLock==0 ); */
+
+ /* Invoke any destructors registered for the compression or
+ ** compression factory callbacks. */
+ if( pDb->factory.xFree ) pDb->factory.xFree(pDb->factory.pCtx);
+ if( pDb->compress.xFree ) pDb->compress.xFree(pDb->compress.pCtx);
+
+ lsmFree(pDb->pEnv, pDb->rollback.aArray);
+ lsmFree(pDb->pEnv, pDb->aTrans);
+ lsmFree(pDb->pEnv, pDb->apShm);
+ lsmFree(pDb->pEnv, pDb); /* The handle itself goes last */
+ }
+ }
+ return rc;
+}
+
+int lsm_config(lsm_db *pDb, int eParam, ...){ /* Query or set one configuration parameter */
+ int rc = LSM_OK;
+ va_list ap;
+ va_start(ap, eParam);
+
+ switch( eParam ){
+ case LSM_CONFIG_AUTOFLUSH: {
+ /* This parameter is read and written in KB. But all internal
+ ** processing is done in bytes. */
+ int *piVal = va_arg(ap, int *);
+ int iVal = *piVal;
+ if( iVal>=0 && iVal<=(1024*1024) ){ /* Out-of-range input leaves the setting as is */
+ pDb->nTreeLimit = iVal*1024;
+ }
+ *piVal = (pDb->nTreeLimit / 1024); /* Always report the current value */
+ break;
+ }
+
+ case LSM_CONFIG_AUTOWORK: { /* Negative input queries without changing */
+ int *piVal = va_arg(ap, int *);
+ if( *piVal>=0 ){
+ pDb->bAutowork = *piVal;
+ }
+ *piVal = pDb->bAutowork;
+ break;
+ }
+
+ case LSM_CONFIG_AUTOCHECKPOINT: {
+ /* This parameter is read and written in KB. But all internal processing
+ ** (including the lsm_db.nAutockpt variable) is done in bytes. */
+ int *piVal = va_arg(ap, int *);
+ if( *piVal>=0 ){
+ int iVal = *piVal;
+ pDb->nAutockpt = (i64)iVal * 1024; /* Widen to i64 before multiplying */
+ }
+ *piVal = (int)(pDb->nAutockpt / 1024);
+ break;
+ }
+
+ case LSM_CONFIG_PAGE_SIZE: {
+ int *piVal = va_arg(ap, int *);
+ if( pDb->pDatabase ){
+ /* If lsm_open() has been called, this is a read-only parameter.
+ ** Set the output variable to the page-size according to the
+ ** FileSystem object. */
+ *piVal = lsmFsPageSize(pDb->pFS);
+ }else{
+ if( *piVal>=256 && *piVal<=65536 && ((*piVal-1) & *piVal)==0 ){ /* Power of two in [256, 65536] */
+ pDb->nDfltPgsz = *piVal;
+ }else{
+ *piVal = pDb->nDfltPgsz; /* Rejected: report the current default */
+ }
+ }
+ break;
+ }
+
+ case LSM_CONFIG_BLOCK_SIZE: {
+ /* This parameter is read and written in KB. But all internal
+ ** processing is done in bytes. */
+ int *piVal = va_arg(ap, int *);
+ if( pDb->pDatabase ){
+ /* If lsm_open() has been called, this is a read-only parameter.
+ ** Set the output variable to the block-size in KB according to the
+ ** FileSystem object. */
+ *piVal = lsmFsBlockSize(pDb->pFS) / 1024;
+ }else{
+ int iVal = *piVal;
+ if( iVal>=64 && iVal<=65536 && ((iVal-1) & iVal)==0 ){ /* Power of two in [64, 65536] KB */
+ pDb->nDfltBlksz = iVal * 1024;
+ }else{
+ *piVal = pDb->nDfltBlksz / 1024;
+ }
+ }
+ break;
+ }
+
+ case LSM_CONFIG_SAFETY: { /* Values 0..2 are accepted; others only query */
+ int *piVal = va_arg(ap, int *);
+ if( *piVal>=0 && *piVal<=2 ){
+ pDb->eSafety = *piVal;
+ }
+ *piVal = pDb->eSafety;
+ break;
+ }
+
+ case LSM_CONFIG_MMAP: {
+ int *piVal = va_arg(ap, int *);
+ if( pDb->iReader<0 && *piVal>=0 ){ /* Changed only while iReader<0 */
+ pDb->iMmap = *piVal;
+ rc = lsmFsConfigure(pDb);
+ }
+ *piVal = pDb->iMmap;
+ break;
+ }
+
+ case LSM_CONFIG_USE_LOG: {
+ int *piVal = va_arg(ap, int *);
+ if( pDb->nTransOpen==0 && (*piVal==0 || *piVal==1) ){ /* Not while a transaction is open */
+ pDb->bUseLog = *piVal;
+ }
+ *piVal = pDb->bUseLog;
+ break;
+ }
+
+ case LSM_CONFIG_AUTOMERGE: {
+ int *piVal = va_arg(ap, int *);
+ if( *piVal>1 ) pDb->nMerge = *piVal; /* Values of 1 or less only query */
+ *piVal = pDb->nMerge;
+ break;
+ }
+
+ case LSM_CONFIG_MAX_FREELIST: {
+ int *piVal = va_arg(ap, int *);
+ if( *piVal>=2 && *piVal<=LSM_MAX_FREELIST_ENTRIES ){
+ pDb->nMaxFreelist = *piVal;
+ }
+ *piVal = pDb->nMaxFreelist;
+ break;
+ }
+
+ case LSM_CONFIG_MULTIPLE_PROCESSES: {
+ int *piVal = va_arg(ap, int *);
+ if( pDb->pDatabase ){
+ /* If lsm_open() has been called, this is a read-only parameter.
+ ** Set the output variable to true if this connection is currently
+ ** in multi-process mode. */
+ *piVal = lsmDbMultiProc(pDb);
+ }else{
+ pDb->bMultiProc = *piVal = (*piVal!=0); /* Normalise to 0 or 1 */
+ }
+ break;
+ }
+
+ case LSM_CONFIG_READONLY: {
+ int *piVal = va_arg(ap, int *);
+ /* If lsm_open() has been called, this is a read-only parameter. */
+ if( pDb->pDatabase==0 && *piVal>=0 ){
+ pDb->bReadonly = *piVal = (*piVal!=0); /* Normalise to 0 or 1 */
+ }
+ *piVal = pDb->bReadonly;
+ break;
+ }
+
+ case LSM_CONFIG_SET_COMPRESSION: {
+ lsm_compress *p = va_arg(ap, lsm_compress *);
+ if( pDb->iReader>=0 && pDb->bInFactory==0 ){
+ /* May not change compression schemes with an open transaction */
+ rc = LSM_MISUSE_BKPT;
+ }else{
+ if( pDb->compress.xFree ){
+ /* Invoke any destructor belonging to the current compression. */
+ pDb->compress.xFree(pDb->compress.pCtx);
+ }
+ if( p->xBound==0 ){ /* A NULL xBound selects "no compression" */
+ memset(&pDb->compress, 0, sizeof(lsm_compress));
+ pDb->compress.iId = LSM_COMPRESSION_NONE;
+ }else{
+ memcpy(&pDb->compress, p, sizeof(lsm_compress));
+ }
+ rc = lsmFsConfigure(pDb);
+ }
+ break;
+ }
+
+ case LSM_CONFIG_SET_COMPRESSION_FACTORY: {
+ lsm_compress_factory *p = va_arg(ap, lsm_compress_factory *);
+ if( pDb->factory.xFree ){
+ /* Invoke any destructor belonging to the current factory. */
+ pDb->factory.xFree(pDb->factory.pCtx);
+ }
+ memcpy(&pDb->factory, p, sizeof(lsm_compress_factory));
+ break;
+ }
+
+ case LSM_CONFIG_GET_COMPRESSION: { /* Copy the current hooks out to the caller */
+ lsm_compress *p = va_arg(ap, lsm_compress *);
+ memcpy(p, &pDb->compress, sizeof(lsm_compress));
+ break;
+ }
+
+ default: /* Unrecognised eParam */
+ rc = LSM_MISUSE;
+ break;
+ }
+
+ va_end(ap);
+ return rc;
+}
+
+void lsmAppendSegmentList(LsmString *pStr, char *zPre, Segment *pSeg){ /* Append zPre then one segment as "{a b c d}" */
+ lsmStringAppendf(pStr, "%s{%d %d %d %d}", zPre,
+ pSeg->iFirst, pSeg->iLastPg, pSeg->iRoot, pSeg->nSize /* Field order of the output list */
+ );
+}
+
+static int infoGetWorker(lsm_db *pDb, Snapshot **pp, int *pbUnlock){ /* Ensure pDb->pWorker is set; pair with infoFreeWorker() */
+ int rc = LSM_OK;
+
+ assert( *pbUnlock==0 ); /* Caller must initialise the flag to 0 */
+ if( !pDb->pWorker ){
+ rc = lsmBeginWork(pDb);
+ if( rc!=LSM_OK ) return rc;
+ *pbUnlock = 1; /* Begun here, so the caller must finish it */
+ }
+ if( pp ) *pp = pDb->pWorker; /* pp is optional */
+ return rc;
+}
+
+static void infoFreeWorker(lsm_db *pDb, int bUnlock){ /* Undo infoGetWorker() when bUnlock is set */
+ if( bUnlock ){
+ int rcdummy = LSM_BUSY; /* Non-OK on entry; value afterwards is discarded */
+ lsmFinishWork(pDb, 0, &rcdummy);
+ }
+}
+
+int lsmStructList(
+ lsm_db *pDb, /* Database handle */
+ char **pzOut /* OUT: Nul-terminated string (tcl list) */
+){
+ Level *pTopLevel = 0; /* Top level of snapshot to report on */
+ int rc = LSM_OK;
+ Level *p; /* Iterator over the level list */
+ LsmString s; /* Accumulates the output text */
+ Snapshot *pWorker; /* Worker snapshot */
+ int bUnlock = 0; /* Set if infoGetWorker() began the work */
+
+ /* Obtain the worker snapshot */
+ rc = infoGetWorker(pDb, &pWorker, &bUnlock);
+ if( rc!=LSM_OK ) return rc;
+
+ /* Format the contents of the snapshot as text */
+ pTopLevel = lsmDbSnapshotLevel(pWorker);
+ lsmStringInit(&s, pDb->pEnv);
+ for(p=pTopLevel; rc==LSM_OK && p; p=p->pNext){
+ int i;
+ lsmStringAppendf(&s, "%s{%d", (s.n ? " " : ""), (int)p->iAge); /* Space-separate levels after the first */
+ lsmAppendSegmentList(&s, " ", &p->lhs);
+ for(i=0; rc==LSM_OK && i<p->nRight; i++){ /* One entry per right-hand segment */
+ lsmAppendSegmentList(&s, " ", &p->aRhs[i]);
+ }
+ lsmStringAppend(&s, "}", 1);
+ }
+ rc = s.n>=0 ? LSM_OK : LSM_NOMEM; /* A negative s.n is reported as LSM_NOMEM */
+
+ /* Release the snapshot and return */
+ infoFreeWorker(pDb, bUnlock);
+ *pzOut = s.z; /* Written even when rc is LSM_NOMEM */
+ return rc;
+}
+
+static int infoFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){ /* lsmWalkFreelist() callback: append "{iBlk iSnapshot}" */
+ LsmString *pStr = (LsmString *)pCtx; /* Context is the output string */
+ lsmStringAppendf(pStr, "%s{%d %lld}", (pStr->n?" ":""), iBlk, iSnapshot);
+ return 0; /* Always returns 0 */
+}
+
+int lsmInfoFreelist(lsm_db *pDb, char **pzOut){ /* Format the free-list as text */
+ Snapshot *pWorker; /* Worker snapshot */
+ int bUnlock = 0; /* Set if infoGetWorker() began the work */
+ LsmString s; /* Accumulates the output text */
+ int rc;
+
+ /* Obtain the worker snapshot */
+ rc = infoGetWorker(pDb, &pWorker, &bUnlock);
+ if( rc!=LSM_OK ) return rc;
+
+ lsmStringInit(&s, pDb->pEnv);
+ rc = lsmWalkFreelist(pDb, 0, infoFreelistCb, &s);
+ if( rc!=LSM_OK ){
+ lsmFree(pDb->pEnv, s.z); /* On error *pzOut is left unmodified */
+ }else{
+ *pzOut = s.z; /* Buffer is handed to the caller */
+ }
+
+ /* Release the snapshot and return */
+ infoFreeWorker(pDb, bUnlock);
+ return rc;
+}
+
+static int infoTreeSize(lsm_db *db, int *pnOldKB, int *pnNewKB){ /* Report old/new tree sizes in KB, rounded up */
+ ShmHeader *pShm = db->pShmhdr;
+ TreeHeader *p = &pShm->hdr1; /* Read directly from the shared header */
+
+ /* The following code suffers from two race conditions, as it accesses and
+ ** trusts the contents of shared memory without verifying checksums:
+ **
+ ** * The two values read - TreeHeader.root.nByte and oldroot.nByte - are
+ ** 32-bit fields. It is assumed that reading from one of these
+ ** is atomic - that it is not possible to read a partially written
+ ** garbage value. However the two values may be mutually inconsistent.
+ **
+ ** * TreeHeader.iLogOff is a 64-bit value. And lsmCheckpointLogOffset()
+ ** reads a 64-bit value from a snapshot stored in shared memory. It
+ ** is assumed that in each case it is possible to read a partially
+ ** written garbage value. If this occurs, then the value returned
+ ** for the size of the "old" tree may reflect the size of an "old"
+ ** tree that was recently flushed to disk.
+ **
+ ** Given the context in which this function is called (as a result of an
+ ** lsm_info(LSM_INFO_TREE_SIZE) request), neither of these are considered to
+ ** be problems.
+ */
+ *pnNewKB = ((int)p->root.nByte + 1023) / 1024;
+ if( p->iOldShmid ){
+ if( p->iOldLog==lsmCheckpointLogOffset(pShm->aSnap1) ){
+ *pnOldKB = 0; /* iOldLog equals the checkpointed log offset */
+ }else{
+ *pnOldKB = ((int)p->oldroot.nByte + 1023) / 1024;
+ }
+ }else{
+ *pnOldKB = 0; /* iOldShmid is zero */
+ }
+
+ return LSM_OK; /* This function cannot fail */
+}
+
+int lsm_info(lsm_db *pDb, int eParam, ...){ /* Dispatch an information request selected by eParam */
+ int rc = LSM_OK;
+ va_list ap;
+ va_start(ap, eParam);
+
+ switch( eParam ){
+ case LSM_INFO_NWRITE: { /* Arg: int* */
+ int *piVal = va_arg(ap, int *);
+ *piVal = lsmFsNWrite(pDb->pFS);
+ break;
+ }
+
+ case LSM_INFO_NREAD: { /* Arg: int* */
+ int *piVal = va_arg(ap, int *);
+ *piVal = lsmFsNRead(pDb->pFS);
+ break;
+ }
+
+ case LSM_INFO_DB_STRUCTURE: { /* Arg: char** */
+ char **pzVal = va_arg(ap, char **);
+ rc = lsmStructList(pDb, pzVal);
+ break;
+ }
+
+ case LSM_INFO_ARRAY_STRUCTURE: { /* Args: Pgno, char** */
+ Pgno pgno = va_arg(ap, Pgno);
+ char **pzVal = va_arg(ap, char **);
+ rc = lsmInfoArrayStructure(pDb, 0, pgno, pzVal);
+ break;
+ }
+
+ case LSM_INFO_ARRAY_PAGES: { /* Args: Pgno, char** */
+ Pgno pgno = va_arg(ap, Pgno);
+ char **pzVal = va_arg(ap, char **);
+ rc = lsmInfoArrayPages(pDb, pgno, pzVal);
+ break;
+ }
+
+ case LSM_INFO_PAGE_HEX_DUMP:
+ case LSM_INFO_PAGE_ASCII_DUMP: { /* Args: Pgno, char** */
+ Pgno pgno = va_arg(ap, Pgno);
+ char **pzVal = va_arg(ap, char **);
+ int bUnlock = 0;
+ rc = infoGetWorker(pDb, 0, &bUnlock);
+ if( rc==LSM_OK ){
+ int bHex = (eParam==LSM_INFO_PAGE_HEX_DUMP); /* Hex for HEX_DUMP, otherwise ASCII */
+ rc = lsmInfoPageDump(pDb, pgno, bHex, pzVal);
+ }
+ infoFreeWorker(pDb, bUnlock); /* No-op if bUnlock is still 0 */
+ break;
+ }
+
+ case LSM_INFO_LOG_STRUCTURE: { /* Arg: char** */
+ char **pzVal = va_arg(ap, char **);
+ rc = lsmInfoLogStructure(pDb, pzVal);
+ break;
+ }
+
+ case LSM_INFO_FREELIST: { /* Arg: char** */
+ char **pzVal = va_arg(ap, char **);
+ rc = lsmInfoFreelist(pDb, pzVal);
+ break;
+ }
+
+ case LSM_INFO_CHECKPOINT_SIZE: { /* Arg: int* */
+ int *pnKB = va_arg(ap, int *);
+ rc = lsmCheckpointSize(pDb, pnKB);
+ break;
+ }
+
+ case LSM_INFO_TREE_SIZE: { /* Args: int* (old), int* (new) */
+ int *pnOld = va_arg(ap, int *);
+ int *pnNew = va_arg(ap, int *);
+ rc = infoTreeSize(pDb, pnOld, pnNew);
+ break;
+ }
+
+ case LSM_INFO_COMPRESSION_ID: { /* Arg: unsigned int* */
+ unsigned int *piOut = va_arg(ap, unsigned int *);
+ if( pDb->pClient ){
+ *piOut = pDb->pClient->iCmpId; /* Taken from the held client snapshot */
+ }else{
+ rc = lsmInfoCompressionId(pDb, piOut);
+ }
+ break;
+ }
+
+ default: /* Unrecognised eParam */
+ rc = LSM_MISUSE;
+ break;
+ }
+
+ va_end(ap);
+ return rc;
+}
+
+static int doWriteOp( /* Shared implementation of insert, delete and delete-range */
+ lsm_db *pDb,
+ int bDeleteRange,
+ const void *pKey, int nKey, /* Key to write or delete */
+ const void *pVal, int nVal /* Value to write. Or nVal==-1 for a delete */
+){
+ int rc = LSM_OK; /* Return code */
+ int bCommit = 0; /* True to commit before returning */
+
+ if( pDb->nTransOpen==0 ){ /* No transaction open: wrap this op in its own */
+ bCommit = 1;
+ rc = lsm_begin(pDb, 1);
+ }
+
+ if( rc==LSM_OK ){
+ int eType = (bDeleteRange ? LSM_DRANGE : (nVal>=0?LSM_WRITE:LSM_DELETE));
+ rc = lsmLogWrite(pDb, eType, (void *)pKey, nKey, (void *)pVal, nVal);
+ }
+
+ lsmSortedSaveTreeCursors(pDb);
+
+ if( rc==LSM_OK ){
+ int pgsz = lsmFsPageSize(pDb->pFS);
+ int nQuant = LSM_AUTOWORK_QUANT * pgsz; /* Divisor for the auto-work quanta below */
+ int nBefore; /* Tree size before the write */
+ int nAfter; /* Tree size after the write */
+ int nDiff; /* Number of nQuant boundaries crossed */
+
+ if( nQuant>pDb->nTreeLimit ){
+ nQuant = LSM_MAX(pDb->nTreeLimit, pgsz); /* nTreeLimit may be 0; keep the divisor non-zero */
+ }
+
+ nBefore = lsmTreeSize(pDb);
+ if( bDeleteRange ){
+ rc = lsmTreeDelete(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
+ }else{
+ rc = lsmTreeInsert(pDb, (void *)pKey, nKey, (void *)pVal, nVal);
+ }
+
+ nAfter = lsmTreeSize(pDb);
+ nDiff = (nAfter/nQuant) - (nBefore/nQuant);
+ if( rc==LSM_OK && pDb->bAutowork && nDiff!=0 ){
+ rc = lsmSortedAutoWork(pDb, nDiff * LSM_AUTOWORK_QUANT);
+ }
+ }
+
+ /* If a transaction was opened at the start of this function, commit it.
+ ** Or, if an error has occurred, roll it back. */
+ if( bCommit ){
+ if( rc==LSM_OK ){
+ rc = lsm_commit(pDb, 0);
+ }else{
+ lsm_rollback(pDb, 0); /* The original error code is returned */
+ }
+ }
+
+ return rc;
+}
+
+/*
+** Write a new value into the database. Thin wrapper around doWriteOp().
+*/
+int lsm_insert(
+ lsm_db *db, /* Database connection */
+ const void *pKey, int nKey, /* Key to write or delete */
+ const void *pVal, int nVal /* Value to write. Or nVal==-1 for a delete */
+){
+ return doWriteOp(db, 0, pKey, nKey, pVal, nVal); /* bDeleteRange=0 */
+}
+
+/*
+** Delete a value from the database. Thin wrapper around doWriteOp().
+*/
+int lsm_delete(lsm_db *db, const void *pKey, int nKey){
+ return doWriteOp(db, 0, pKey, nKey, 0, -1); /* NULL value with nVal==-1 marks a delete */
+}
+
+/*
+** Delete a range of database keys.
+*/
+int lsm_delete_range(
+ lsm_db *db, /* Database handle */
+ const void *pKey1, int nKey1, /* Lower bound of range to delete */
+ const void *pKey2, int nKey2 /* Upper bound of range to delete */
+){
+ int rc = LSM_OK;
+ if( db->xCmp((void *)pKey1, nKey1, (void *)pKey2, nKey2)<0 ){ /* Only when pKey1 sorts before pKey2 */
+ rc = doWriteOp(db, 1, pKey1, nKey1, pKey2, nKey2); /* bDeleteRange=1 */
+ }
+ return rc; /* LSM_OK with no action if pKey1 does not sort before pKey2 */
+}
+
+/*
+** Open a new cursor handle.
+**
+** If there are currently no other open cursor handles, and no open write
+** transaction, open a read transaction here.
+*/
+int lsm_csr_open(lsm_db *pDb, lsm_cursor **ppCsr){
+ int rc = LSM_OK; /* Return code */
+ MultiCursor *pCsr = 0; /* New cursor object */
+
+ /* Open a read transaction if one is not already open. */
+ assert_db_state(pDb);
+
+ if( pDb->pShmhdr==0 ){ /* Branch asserts the handle is read-only */
+ assert( pDb->bReadonly );
+ rc = lsmBeginRoTrans(pDb);
+ }else if( pDb->iReader<0 ){
+ rc = lsmBeginReadTrans(pDb);
+ }
+
+ /* Allocate the multi-cursor. */
+ if( rc==LSM_OK ){
+ rc = lsmMCursorNew(pDb, &pCsr);
+ }
+
+ /* If an error has occurred, set the output to NULL and delete any partially
+ ** allocated cursor. If this means there are no open cursors, release the
+ ** client snapshot. */
+ if( rc!=LSM_OK ){
+ lsmMCursorClose(pCsr, 0);
+ dbReleaseClientSnapshot(pDb);
+ }
+
+ assert_db_state(pDb);
+ *ppCsr = (lsm_cursor *)pCsr; /* Written on both success and failure */
+ return rc;
+}
+
+/*
+** Close a cursor opened using lsm_csr_open(). A NULL cursor is a no-op.
+*/
+int lsm_csr_close(lsm_cursor *p){
+ if( p ){
+ lsm_db *pDb = lsmMCursorDb((MultiCursor *)p); /* Fetch the owner before closing */
+ assert_db_state(pDb);
+ lsmMCursorClose((MultiCursor *)p, 1);
+ dbReleaseClientSnapshot(pDb); /* Ends the read transaction if nothing is open */
+ assert_db_state(pDb);
+ }
+ return LSM_OK; /* Always succeeds */
+}
+
+/*
+** Attempt to seek the cursor to the database entry specified by pKey/nKey.
+** If an error occurs (e.g. an OOM or IO error), return an LSM error code.
+** Otherwise, return LSM_OK.
+*/
+int lsm_csr_seek(lsm_cursor *pCsr, const void *pKey, int nKey, int eSeek){
+ return lsmMCursorSeek((MultiCursor *)pCsr, 0, (void *)pKey, nKey, eSeek); /* eSeek is passed through unchanged */
+}
+
+int lsm_csr_next(lsm_cursor *pCsr){ /* Public wrapper around lsmMCursorNext() */
+ return lsmMCursorNext((MultiCursor *)pCsr);
+}
+
+int lsm_csr_prev(lsm_cursor *pCsr){ /* Public wrapper around lsmMCursorPrev() */
+ return lsmMCursorPrev((MultiCursor *)pCsr);
+}
+
+int lsm_csr_first(lsm_cursor *pCsr){ /* Public wrapper around lsmMCursorFirst() */
+ return lsmMCursorFirst((MultiCursor *)pCsr);
+}
+
+int lsm_csr_last(lsm_cursor *pCsr){ /* Public wrapper around lsmMCursorLast() */
+ return lsmMCursorLast((MultiCursor *)pCsr);
+}
+
+int lsm_csr_valid(lsm_cursor *pCsr){ /* Public wrapper around lsmMCursorValid() */
+ return lsmMCursorValid((MultiCursor *)pCsr);
+}
+
+int lsm_csr_key(lsm_cursor *pCsr, const void **ppKey, int *pnKey){ /* Public wrapper around lsmMCursorKey() */
+ return lsmMCursorKey((MultiCursor *)pCsr, (void **)ppKey, pnKey); /* Cast drops const for the internal API */
+}
+
+int lsm_csr_value(lsm_cursor *pCsr, const void **ppVal, int *pnVal){ /* Public wrapper around lsmMCursorValue() */
+ return lsmMCursorValue((MultiCursor *)pCsr, (void **)ppVal, pnVal); /* Cast drops const for the internal API */
+}
+
+void lsm_config_log( /* Install the log callback used by lsmLogMessage() */
+ lsm_db *pDb,
+ void (*xLog)(void *, int, const char *), /* Callback; NULL disables lsmLogMessage() */
+ void *pCtx /* Passed as the callback's first argument */
+){
+ pDb->xLog = xLog;
+ pDb->pLogCtx = pCtx;
+}
+
+void lsm_config_work_hook( /* Store a work-hook callback and its context on pDb */
+ lsm_db *pDb,
+ void (*xWork)(lsm_db *, void *), /* Callback; invoked elsewhere in the library */
+ void *pCtx /* Stored in pDb->pWorkCtx */
+){
+ pDb->xWork = xWork;
+ pDb->pWorkCtx = pCtx;
+}
+
+void lsmLogMessage(lsm_db *pDb, int rc, const char *zFormat, ...){ /* Format a message and pass it to pDb->xLog */
+ if( pDb->xLog ){ /* Silently does nothing when no logger is installed */
+ LsmString s; /* Holds the formatted message */
+ va_list ap, ap2; /* lsmStringVAppendf() takes two copies of the list */
+ lsmStringInit(&s, pDb->pEnv);
+ va_start(ap, zFormat);
+ va_start(ap2, zFormat);
+ lsmStringVAppendf(&s, zFormat, ap, ap2);
+ va_end(ap);
+ va_end(ap2);
+ pDb->xLog(pDb->pLogCtx, rc, s.z);
+ lsmStringClear(&s);
+ }
+}
+
+int lsm_begin(lsm_db *pDb, int iLevel){ /* Open nested transactions up to depth iLevel */
+ int rc;
+
+ assert_db_state( pDb );
+ rc = (pDb->bReadonly ? LSM_READONLY : LSM_OK);
+
+ /* A value less than zero means open one more transaction. */
+ if( iLevel<0 ) iLevel = pDb->nTransOpen + 1;
+ if( iLevel>pDb->nTransOpen ){ /* Otherwise already at or beyond this depth */
+ int i;
+
+ /* Extend the pDb->aTrans[] array if required. */
+ if( rc==LSM_OK && pDb->nTransAlloc<iLevel ){
+ TransMark *aNew; /* New allocation */
+ int nByte = sizeof(TransMark) * (iLevel+1);
+ aNew = (TransMark *)lsmRealloc(pDb->pEnv, pDb->aTrans, nByte);
+ if( !aNew ){
+ rc = LSM_NOMEM; /* pDb->aTrans is left untouched */
+ }else{
+ nByte = sizeof(TransMark) * (iLevel+1 - pDb->nTransAlloc);
+ memset(&aNew[pDb->nTransAlloc], 0, nByte); /* Zero only the new entries */
+ pDb->nTransAlloc = iLevel+1;
+ pDb->aTrans = aNew;
+ }
+ }
+
+ if( rc==LSM_OK && pDb->nTransOpen==0 ){
+ rc = lsmBeginWriteTrans(pDb);
+ }
+
+ if( rc==LSM_OK ){
+ for(i=pDb->nTransOpen; i<iLevel; i++){ /* Record marks for each newly opened level */
+ lsmTreeMark(pDb, &pDb->aTrans[i].tree);
+ lsmLogTell(pDb, &pDb->aTrans[i].log);
+ }
+ pDb->nTransOpen = iLevel;
+ }
+ }
+
+ return rc;
+}
+
+int lsm_commit(lsm_db *pDb, int iLevel){ /* Commit nested transactions down to depth iLevel */
+ int rc = LSM_OK;
+
+ assert_db_state( pDb );
+
+ /* A value less than zero means close the innermost nested transaction. */
+ if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
+
+ if( iLevel<pDb->nTransOpen ){ /* Nothing to do if already at or below iLevel */
+ if( iLevel==0 ){
+ int rc2; /* Result of lsmFinishWriteTrans() */
+ /* Commit the transaction to disk. */
+ if( rc==LSM_OK ) rc = lsmLogCommit(pDb);
+ if( rc==LSM_OK && pDb->eSafety==LSM_SAFETY_FULL ){
+ rc = lsmFsSyncLog(pDb->pFS);
+ }
+ rc2 = lsmFinishWriteTrans(pDb, (rc==LSM_OK));
+ if( rc==LSM_OK ) rc = rc2; /* The first error wins */
+ }
+ pDb->nTransOpen = iLevel;
+ }
+ dbReleaseClientSnapshot(pDb);
+ return rc;
+}
+
+int lsm_rollback(lsm_db *pDb, int iLevel){ /* Roll back to depth iLevel; a no-op if no transaction is open */
+ int rc = LSM_OK; /* Never modified: this function always returns LSM_OK */
+ assert_db_state( pDb );
+
+ if( pDb->nTransOpen ){
+ /* A value less than zero means close the innermost nested transaction. */
+ if( iLevel<0 ) iLevel = LSM_MAX(0, pDb->nTransOpen - 1);
+
+ if( iLevel<=pDb->nTransOpen ){
+ TransMark *pMark = &pDb->aTrans[(iLevel==0 ? 0 : iLevel-1)]; /* Levels 0 and 1 both use aTrans[0] */
+ lsmTreeRollback(pDb, &pMark->tree);
+ if( iLevel ) lsmLogSeek(pDb, &pMark->log); /* Log is repositioned only for iLevel>0 */
+ pDb->nTransOpen = iLevel;
+ }
+
+ if( pDb->nTransOpen==0 ){
+ lsmFinishWriteTrans(pDb, 0); /* Second argument 0: finish without committing */
+ }
+ dbReleaseClientSnapshot(pDb);
+ }
+
+ return rc;
+}
+
+int lsm_get_user_version(lsm_db *pDb, unsigned int *piUsr){ /* Read treehdr.iUsrVersion into *piUsr */
+ int rc = LSM_OK; /* Return code */
+
+ /* Open a read transaction if one is not already open. */
+ assert_db_state(pDb);
+ if( pDb->pShmhdr==0 ){
+ assert( pDb->bReadonly );
+ rc = lsmBeginRoTrans(pDb);
+ }else if( pDb->iReader<0 ){
+ rc = lsmBeginReadTrans(pDb);
+ }
+
+ /* Copy the user version out of the tree header. */
+ if( rc==LSM_OK ){
+ *piUsr = pDb->treehdr.iUsrVersion; /* *piUsr is left untouched on error */
+ }
+
+ dbReleaseClientSnapshot(pDb);
+ assert_db_state(pDb);
+ return rc;
+}
+
+int lsm_set_user_version(lsm_db *pDb, unsigned int iUsr){ /* Store iUsr in treehdr.iUsrVersion */
+ int rc = LSM_OK; /* Return code */
+ int bCommit = 0; /* True to commit before returning */
+
+ if( pDb->nTransOpen==0 ){ /* No transaction open: wrap the update in its own */
+ bCommit = 1;
+ rc = lsm_begin(pDb, 1);
+ }
+
+ if( rc==LSM_OK ){
+ pDb->treehdr.iUsrVersion = iUsr;
+ }
+
+ /* If a transaction was opened at the start of this function, commit it.
+ ** Or, if an error has occurred, roll it back. */
+ if( bCommit ){
+ if( rc==LSM_OK ){
+ rc = lsm_commit(pDb, 0);
+ }else{
+ lsm_rollback(pDb, 0); /* The original error code is returned */
+ }
+ }
+
+ return rc;
+}
diff --git a/ext/lsm1/lsm_mem.c b/ext/lsm1/lsm_mem.c
new file mode 100644
index 0000000..13dd9fe
--- /dev/null
+++ b/ext/lsm1/lsm_mem.c
@@ -0,0 +1,104 @@
+/*
+** 2011-08-18
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Helper routines for memory allocation.
+*/
+#include "lsmInt.h"
+
+/*
+** The following routines are called internally by LSM sub-routines. In
+** this case a valid environment pointer must be supplied.
+*/
+void *lsmMalloc(lsm_env *pEnv, size_t N){
+ assert( pEnv );
+ return pEnv->xMalloc(pEnv, N); /* Delegate to the environment's allocator */
+}
+void lsmFree(lsm_env *pEnv, void *p){ /* Internal free; pEnv must be non-NULL */
+ assert( pEnv );
+ pEnv->xFree(pEnv, p); /* Delegate to the environment's xFree */
+}
+void *lsmRealloc(lsm_env *pEnv, void *p, size_t N){ /* Internal realloc; pEnv must be non-NULL */
+ assert( pEnv );
+ return pEnv->xRealloc(pEnv, p, N); /* Delegate to the environment's xRealloc */
+}
+
+/*
+** Core memory allocation routines for LSM. A NULL pEnv selects lsm_default_env().
+*/
+void *lsm_malloc(lsm_env *pEnv, size_t N){
+ return lsmMalloc(pEnv ? pEnv : lsm_default_env(), N); /* Fall back to the default environment */
+}
+void lsm_free(lsm_env *pEnv, void *p){ /* Public free; NULL pEnv selects lsm_default_env() */
+ lsmFree(pEnv ? pEnv : lsm_default_env(), p);
+}
+void *lsm_realloc(lsm_env *pEnv, void *p, size_t N){ /* Public realloc; NULL pEnv selects lsm_default_env() */
+ return lsmRealloc(pEnv ? pEnv : lsm_default_env(), p, N);
+}
+
+void *lsmMallocZero(lsm_env *pEnv, size_t N){ /* Allocate N bytes and zero them */
+ void *pRet; /* New allocation, or NULL */
+ assert( pEnv );
+ pRet = lsmMalloc(pEnv, N);
+ if( pRet ) memset(pRet, 0, N); /* Skipped when the allocation failed */
+ return pRet;
+}
+
+void *lsmMallocRc(lsm_env *pEnv, size_t N, int *pRc){ /* Allocate unless *pRc already holds an error */
+ void *pRet = 0; /* Stays NULL if *pRc was not LSM_OK on entry */
+ if( *pRc==LSM_OK ){
+ pRet = lsmMalloc(pEnv, N);
+ if( pRet==0 ){
+ *pRc = LSM_NOMEM_BKPT; /* Record the allocation failure in *pRc */
+ }
+ }
+ return pRet;
+}
+
+void *lsmMallocZeroRc(lsm_env *pEnv, size_t N, int *pRc){ /* Zeroing variant of lsmMallocRc() */
+ void *pRet = 0; /* Stays NULL if *pRc was not LSM_OK on entry */
+ if( *pRc==LSM_OK ){
+ pRet = lsmMallocZero(pEnv, N);
+ if( pRet==0 ){
+ *pRc = LSM_NOMEM_BKPT; /* Record the allocation failure in *pRc */
+ }
+ }
+ return pRet;
+}
+
+void *lsmReallocOrFree(lsm_env *pEnv, void *p, size_t N){ /* Resize p, or free it if that fails */
+ void *pNew; /* Resized buffer, or NULL */
+ pNew = lsm_realloc(pEnv, p, N);
+ if( !pNew ) lsm_free(pEnv, p); /* On failure p is freed here, so the caller must not reuse it */
+ return pNew;
+}
+
+void *lsmReallocOrFreeRc(lsm_env *pEnv, void *p, size_t N, int *pRc){ /* lsmReallocOrFree() with error propagation */
+ void *pRet = 0;
+ if( *pRc ){
+ lsmFree(pEnv, p); /* Error already pending: free p and return NULL */
+ }else{
+ pRet = lsmReallocOrFree(pEnv, p, N);
+ if( !pRet ) *pRc = LSM_NOMEM_BKPT; /* p has already been freed by the callee */
+ }
+ return pRet;
+}
+
+char *lsmMallocStrdup(lsm_env *pEnv, const char *zIn){ /* Return a copy of zIn from lsmMalloc(), or NULL */
+ int nByte; /* Length of zIn, excluding the terminator */
+ char *zRet; /* The copy */
+ nByte = strlen(zIn);
+ zRet = lsmMalloc(pEnv, nByte+1);
+ if( zRet ){
+ memcpy(zRet, zIn, nByte+1); /* The +1 copies the nul terminator too */
+ }
+ return zRet;
+}
diff --git a/ext/lsm1/lsm_mutex.c b/ext/lsm1/lsm_mutex.c
new file mode 100644
index 0000000..cb99b2a
--- /dev/null
+++ b/ext/lsm1/lsm_mutex.c
@@ -0,0 +1,88 @@
+/*
+** 2012-01-30
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Mutex functions for LSM.
+*/
+#include "lsmInt.h"
+
+/*
+** Allocate a new mutex by calling the xMutexNew() method of environment
+** pEnv. The method's return code is passed back to the caller.
+*/
+int lsmMutexNew(lsm_env *pEnv, lsm_mutex **ppNew){
+  int rc = pEnv->xMutexNew(pEnv, ppNew);
+  return rc;
+}
+
+/*
+** Obtain a handle for static mutex iMutex by calling the xMutexStatic()
+** method of environment pEnv. The method's return code is passed back.
+*/
+int lsmMutexStatic(lsm_env *pEnv, int iMutex, lsm_mutex **ppStatic){
+  int rc = pEnv->xMutexStatic(pEnv, iMutex, ppStatic);
+  return rc;
+}
+
+/*
+** Free a mutex allocated by lsmMutexNew(). Passing a NULL mutex is a
+** harmless no-op.
+*/
+void lsmMutexDel(lsm_env *pEnv, lsm_mutex *pMutex){
+  if( pMutex==0 ) return;
+  pEnv->xMutexDel(pMutex);
+}
+
+/*
+** Enter mutex pMutex using the xMutexEnter() method of environment pEnv.
+*/
+void lsmMutexEnter(lsm_env *pEnv, lsm_mutex *pMutex){
+  (pEnv->xMutexEnter)(pMutex);
+}
+
+/*
+** Non-blocking attempt to enter mutex pMutex. Zero is returned if the
+** mutex was entered. If it is already held by some other thread and so
+** was not entered, non-zero is returned.
+**
+** Every successful call must be paired with a call to lsmMutexLeave().
+*/
+int lsmMutexTry(lsm_env *pEnv, lsm_mutex *pMutex){
+  int res = pEnv->xMutexTry(pMutex);
+  return res;
+}
+
+/*
+** Leave mutex pMutex using the xMutexLeave() method of environment pEnv.
+*/
+void lsmMutexLeave(lsm_env *pEnv, lsm_mutex *pMutex){
+  (pEnv->xMutexLeave)(pMutex);
+}
+
+#ifndef NDEBUG
+/*
+** Used only inside assert() statements. Returns non-zero if the calling
+** thread holds mutex pMutex, zero otherwise. If the environment supplies
+** no xMutexHeld() method the question cannot be answered, so non-zero is
+** returned.
+*/
+int lsmMutexHeld(lsm_env *pEnv, lsm_mutex *pMutex){
+  if( !pEnv->xMutexHeld ) return 1;
+  return pEnv->xMutexHeld(pMutex);
+}
+
+/*
+** Used only inside assert() statements. Returns non-zero if the calling
+** thread does not hold mutex pMutex, zero otherwise. If the environment
+** supplies no xMutexNotHeld() method the question cannot be answered, so
+** non-zero is returned.
+*/
+int lsmMutexNotHeld(lsm_env *pEnv, lsm_mutex *pMutex){
+  if( !pEnv->xMutexNotHeld ) return 1;
+  return pEnv->xMutexNotHeld(pMutex);
+}
+#endif
diff --git a/ext/lsm1/lsm_shared.c b/ext/lsm1/lsm_shared.c
new file mode 100644
index 0000000..83e44b4
--- /dev/null
+++ b/ext/lsm1/lsm_shared.c
@@ -0,0 +1,1980 @@
+/*
+** 2012-01-23
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** Utilities used to help multiple LSM clients to coexist within the
+** same process space.
+*/
+#include "lsmInt.h"
+
+/*
+** Global data. All global variables used by code in this file are grouped
+** into the following structure instance.
+**
+** pDatabase:
+** Head of the linked list of all Database objects allocated within this
+** process. Entries are chained together through Database.pDbNext. This
+** list may not be traversed without holding the global mutex (see
+** functions enterGlobalMutex() and leaveGlobalMutex()).
+*/
+static struct SharedData {
+ Database *pDatabase; /* Linked list of all Database objects */
+} gShared;
+
+/*
+** Database structure. There is one such structure for each distinct
+** database accessed by this process. They are stored in the singly linked
+** list starting at global variable gShared.pDatabase. Database objects are
+** reference counted. Once the number of connections to the associated
+** database drops to zero, they are removed from the linked list and deleted.
+**
+** zName:
+** Points into the same allocation as the structure itself (it is set
+** to the bytes that immediately follow the struct when the object is
+** created in lsmDbDatabaseConnect()), so it is never freed separately.
+**
+** nDbRef:
+** Incremented by lsmDbDatabaseConnect() and decremented by
+** lsmDbDatabaseRelease(), in both cases under the global mutex.
+**
+** pFile:
+** In multi-process mode, this file descriptor is used to obtain locks
+** and to access shared-memory. In single process mode, its only job is
+** to hold the exclusive lock on the file.
+**
+** pLsmFile:
+** File handles pushed here by dbDeferClose() and popped again by
+** lsmDbRecycleFd(). Any still on the list are closed when the last
+** reference to the Database object is released.
+**
+*/
+struct Database {
+ /* Protected by the global mutex (enterGlobalMutex/leaveGlobalMutex): */
+ char *zName; /* Canonical path to database file */
+ int nName; /* strlen(zName) */
+ int nDbRef; /* Number of associated lsm_db handles */
+ Database *pDbNext; /* Next Database structure in global list */
+
+ /* Protected by the local mutex (pClientMutex) */
+ int bReadonly; /* True if Database.pFile is read-only */
+ int bMultiProc; /* True if running in multi-process mode */
+ lsm_file *pFile; /* Used for locks/shm in multi-proc mode */
+ LsmFile *pLsmFile; /* List of deferred closes */
+ lsm_mutex *pClientMutex; /* Protects the apShmChunk[] and pConn */
+ int nShmChunk; /* Number of entries in apShmChunk[] array */
+ void **apShmChunk; /* Array of "shared" memory regions */
+ lsm_db *pConn; /* List of connections to this db. */
+};
+
+/*
+** The global mutex protects the linked list headed at gShared.pDatabase.
+**
+** enterGlobalMutex() looks up the static LSM_MUTEX_GLOBAL mutex and, if
+** the lookup succeeds, enters it. The return code of the lookup is
+** returned to the caller. leaveGlobalMutex() is the matching release.
+*/
+static int enterGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *pGlobal;
+  int rc;
+
+  rc = lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
+  if( rc!=LSM_OK ) return rc;
+  lsmMutexEnter(pEnv, pGlobal);
+  return rc;
+}
+static void leaveGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *pGlobal;
+
+  /* Look the static mutex up again (the result code is not examined) and
+  ** leave it. */
+  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
+  lsmMutexLeave(pEnv, pGlobal);
+}
+
+#ifdef LSM_DEBUG
+static int holdingGlobalMutex(lsm_env *pEnv){
+  lsm_mutex *pGlobal;
+  int bHeld;
+
+  /* Debug-only helper: report the result of lsmMutexHeld() for the static
+  ** LSM_MUTEX_GLOBAL mutex. */
+  lsmMutexStatic(pEnv, LSM_MUTEX_GLOBAL, &pGlobal);
+  bHeld = lsmMutexHeld(pEnv, pGlobal);
+  return bHeld;
+}
+#endif
+
+#if 0
+/* Debugging aid (currently compiled out): assert that free-list p holds
+** no entry for block iBlk. */
+static void assertNotInFreelist(Freelist *p, int iBlk){
+  int i;
+  for(i=0; i<p->nEntry; i++){
+    assert( p->aEntry[i].iBlk!=iBlk );
+  }
+}
+#else
+# define assertNotInFreelist(x,y)
+#endif
+
+/*
+** Append an entry to the free-list. If (iId==-1), this is a delete.
+**
+** The list operated on is db->pFreelist if db->bUseFreelist is set, or
+** the free-list of the worker snapshot (db->pWorker) otherwise. Entries
+** are kept sorted in ascending order of block number. If the list already
+** holds an entry for block iBlk, its id is overwritten with iId; otherwise
+** a new entry is inserted at the correct position.
+**
+** Returns LSM_OK if successful, or LSM_NOMEM_BKPT if the entry array
+** could not be enlarged.
+*/
+int freelistAppend(lsm_db *db, u32 iBlk, i64 iId){
+  lsm_env *pEnv = db->pEnv;
+  Freelist *p;
+  int i;
+
+  assert( iId==-1 || iId>=0 );
+  p = db->bUseFreelist ? db->pFreelist : &db->pWorker->freelist;
+
+  /* Extend the space allocated for the freelist, if required. This is
+  ** done up front so that the insert below can never run out of room. */
+  assert( p->nAlloc>=p->nEntry );
+  if( p->nAlloc==p->nEntry ){
+    int nNew;
+    int nByte;
+    FreelistEntry *aNew;
+
+    nNew = (p->nAlloc==0 ? 4 : p->nAlloc*2);
+    nByte = sizeof(FreelistEntry) * nNew;
+    aNew = (FreelistEntry *)lsmRealloc(pEnv, p->aEntry, nByte);
+    if( !aNew ) return LSM_NOMEM_BKPT;
+    p->nAlloc = nNew;
+    p->aEntry = aNew;
+  }
+
+  /* Find the first entry with a block number greater than or equal to
+  ** iBlk. If there is none, i is left equal to p->nEntry. */
+  for(i=0; i<p->nEntry; i++){
+    assert( i==0 || p->aEntry[i].iBlk > p->aEntry[i-1].iBlk );
+    if( p->aEntry[i].iBlk>=iBlk ) break;
+  }
+
+  if( i<p->nEntry && p->aEntry[i].iBlk==iBlk ){
+    /* Clobber an existing entry */
+    p->aEntry[i].iId = iId;
+  }else{
+    /* Insert a new entry into the list */
+    int nByte = sizeof(FreelistEntry)*(p->nEntry-i);
+    memmove(&p->aEntry[i+1], &p->aEntry[i], nByte);
+    p->aEntry[i].iBlk = iBlk;
+    p->aEntry[i].iId = iId;
+    p->nEntry++;
+  }
+
+  return LSM_OK;
+}
+
+/*
+** Release every resource owned by Database object p: its client mutex,
+** its file handle (if one is open), the array of shared-memory chunk
+** pointers and finally the structure itself. A NULL p is a no-op. The
+** global mutex must be held by the caller.
+*/
+static void freeDatabase(lsm_env *pEnv, Database *p){
+  assert( holdingGlobalMutex(pEnv) );
+  if( p==0 ) return;
+
+  /* The client mutex goes first */
+  lsmMutexDel(pEnv, p->pClientMutex);
+
+  /* Then the file handle, if there is one */
+  if( p->pFile!=0 ){
+    lsmEnvClose(pEnv, p->pFile);
+  }
+
+  /* Then the pointer array, followed by the object itself */
+  lsmFree(pEnv, p->apShmChunk);
+  lsmFree(pEnv, p);
+}
+
+/*
+** Context object handed (as the void* argument) to dbTruncateCb().
+*/
+typedef struct DbTruncateCtx DbTruncateCtx;
+struct DbTruncateCtx {
+ int nBlock; /* Block number the callback expects next; decremented on match */
+ i64 iInUse; /* If >=0, callback stops at snapshot ids >= this value */
+};
+
+/*
+** Free-list walker callback used to find the last block that is not free.
+** pCtx points to a DbTruncateCtx. The walk is stopped (non-zero returned)
+** as soon as iBlk is not the block currently expected (ctx.nBlock), or,
+** when ctx.iInUse is non-negative, as soon as iSnapshot is at or beyond
+** ctx.iInUse. Otherwise ctx.nBlock is decremented and the walk continues.
+*/
+static int dbTruncateCb(void *pCtx, int iBlk, i64 iSnapshot){
+  DbTruncateCtx *pTrunc = (DbTruncateCtx *)pCtx;
+
+  if( iBlk!=pTrunc->nBlock ) return 1;
+  if( pTrunc->iInUse>=0 && iSnapshot>=pTrunc->iInUse ) return 1;
+  pTrunc->nBlock--;
+  return 0;
+}
+
+/*
+** The implementation of this function is currently compiled out (see the
+** "#if 0" below). As built, both arguments are ignored and LSM_OK is
+** always returned.
+*/
+static int dbTruncate(lsm_db *pDb, i64 iInUse){
+ int rc = LSM_OK;
+#if 0
+ int i;
+ DbTruncateCtx ctx;
+
+ assert( pDb->pWorker );
+ ctx.nBlock = pDb->pWorker->nBlock;
+ ctx.iInUse = iInUse;
+
+ rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&ctx);
+ for(i=ctx.nBlock+1; rc==LSM_OK && i<=pDb->pWorker->nBlock; i++){
+ rc = freelistAppend(pDb, i, -1);
+ }
+
+ if( rc==LSM_OK ){
+#ifdef LSM_LOG_FREELIST
+ if( ctx.nBlock!=pDb->pWorker->nBlock ){
+ lsmLogMessage(pDb, 0,
+ "dbTruncate(): truncated db to %d blocks",ctx.nBlock
+ );
+ }
+#endif
+ pDb->pWorker->nBlock = ctx.nBlock;
+ }
+#endif
+ return rc;
+}
+
+
+/*
+** Called during database shutdown, when the number of connections drops
+** from one to zero. The database file is truncated to the smallest size
+** that does not discard any block containing data.
+**
+** The caller must hold an exclusive DMS1 lock and must not have a worker
+** snapshot loaded. The worker snapshot loaded here is freed again before
+** returning, whether or not an error occurred.
+*/
+static int dbTruncateFile(lsm_db *pDb){
+  int rc;
+
+  assert( pDb->pWorker==0 );
+  assert( lsmShmAssertLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL) );
+
+  rc = lsmCheckpointLoadWorker(pDb);
+  if( rc==LSM_OK ){
+    DbTruncateCtx sTrunc;
+
+    /* Walk the free-block-list backwards. When the walk is over,
+    ** sTrunc.nBlock is the number of the last block in the database that
+    ** actually contains data. */
+    sTrunc.iInUse = -1;
+    sTrunc.nBlock = pDb->pWorker->nBlock;
+    rc = lsmWalkFreelist(pDb, 1, dbTruncateCb, (void *)&sTrunc);
+
+    /* Cut the file off after that block, unless an error occurred. */
+    if( rc==LSM_OK ){
+      i64 nByte = (i64)sTrunc.nBlock * lsmFsBlockSize(pDb->pFS);
+      rc = lsmFsTruncateDb(pDb->pFS, nByte);
+    }
+  }
+
+  lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
+  pDb->pWorker = 0;
+  return rc;
+}
+
+/*
+** Disconnect database handle pDb from the shared-memory system.
+**
+** A read-only handle simply releases its DMS3 lock. A read-write handle
+** takes the exclusive DMS1 lock and releases DMS2. If exclusive-lock
+** tests on DMS2 and CHECKPOINTER then succeed, this is treated as the
+** last connection: the in-memory tree is flushed if required, a
+** checkpoint is written and, when no read-only clients are detected, the
+** log file is deleted and the database file truncated. The RWCLIENT lock
+** (if one is held) and DMS1 are released afterwards.
+**
+** In all cases pDb->pShmhdr is cleared before returning. Errors are not
+** reported to the caller.
+*/
+static void doDbDisconnect(lsm_db *pDb){
+ int rc;
+
+ if( pDb->bReadonly ){
+ lsmShmLock(pDb, LSM_LOCK_DMS3, LSM_LOCK_UNLOCK, 0);
+ }else{
+ /* Block for an exclusive lock on DMS1. This lock serializes all calls
+ ** to doDbConnect() and doDbDisconnect() across all processes. */
+ rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+ if( rc==LSM_OK ){
+
+ lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_UNLOCK, 0);
+
+ /* Try an exclusive lock on DMS2. If successful, this is the last
+ ** connection to the database. In this case flush the contents of the
+ ** in-memory tree to disk and write a checkpoint. */
+ rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 1, LSM_LOCK_EXCL);
+ if( rc==LSM_OK ){
+ rc = lsmShmTestLock(pDb, LSM_LOCK_CHECKPOINTER, 1, LSM_LOCK_EXCL);
+ }
+ if( rc==LSM_OK ){
+ int bReadonly = 0; /* True if there exist read-only conns. */
+
+ /* Flush the in-memory tree, if required. If there is data to flush,
+ ** this will create a new client snapshot in Database.pClient. The
+ ** checkpoint (serialization) of this snapshot may be written to disk
+ ** by the following block.
+ **
+ ** There is no need to take a WRITER lock here. That there are no
+ ** other locks on DMS2 guarantees that there are no other read-write
+ ** connections at this time (and the lock on DMS1 guarantees that
+ ** no new ones may appear).
+ */
+ rc = lsmTreeLoadHeader(pDb, 0);
+ if( rc==LSM_OK && (lsmTreeHasOld(pDb) || lsmTreeSize(pDb)>0) ){
+ rc = lsmFlushTreeToDisk(pDb);
+ }
+
+ /* Now check if there are any read-only connections. If there are,
+ ** then do not truncate the db file or unlink the shared-memory
+ ** region. */
+ if( rc==LSM_OK ){
+ rc = lsmShmTestLock(pDb, LSM_LOCK_DMS3, 1, LSM_LOCK_EXCL);
+ if( rc==LSM_BUSY ){
+ bReadonly = 1;
+ rc = LSM_OK;
+ }
+ }
+
+ /* Write a checkpoint to disk. */
+ if( rc==LSM_OK ){
+ rc = lsmCheckpointWrite(pDb, 0);
+ }
+
+ /* If the checkpoint was written successfully, delete the log file
+ ** and, if possible, truncate the database file. */
+ if( rc==LSM_OK ){
+ int bRotrans = 0;
+ Database *p = pDb->pDatabase;
+
+ /* The log file may only be deleted if there are no read-only
+ ** clients running rotrans transactions. */
+ rc = lsmDetectRoTrans(pDb, &bRotrans);
+ if( rc==LSM_OK && bRotrans==0 ){
+ lsmFsCloseAndDeleteLog(pDb->pFS);
+ }
+
+ /* The database may only be truncated if there exist no read-only
+ ** clients - either connected or running rotrans transactions.
+ **
+ ** NOTE(review): rc is not re-checked here, so if lsmDetectRoTrans()
+ ** failed and left bRotrans at 0 the truncation still runs. The
+ ** return value of dbTruncateFile() is also discarded. Confirm both
+ ** are intended. */
+ if( bReadonly==0 && bRotrans==0 ){
+ lsmFsUnmap(pDb->pFS);
+ dbTruncateFile(pDb);
+ if( p->pFile && p->bMultiProc ){
+ lsmEnvShmUnmap(pDb->pEnv, p->pFile, 1);
+ }
+ }
+ }
+ }
+ }
+
+ /* Give up the read-write client slot, if this handle holds one. */
+ if( pDb->iRwclient>=0 ){
+ lsmShmLock(pDb, LSM_LOCK_RWCLIENT(pDb->iRwclient), LSM_LOCK_UNLOCK, 0);
+ pDb->iRwclient = -1;
+ }
+
+ lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+ }
+ pDb->pShmhdr = 0;
+}
+
+/*
+** Connect read-write database handle pDb to the shared-memory system.
+**
+** The first shared-memory chunk is mapped and pDb->pShmhdr pointed at it.
+** Then, while holding an exclusive lock on DMS1:
+**
+**   * If an exclusive-lock test on DMS2/DMS3 succeeds, this is the only
+**     connection. The shared-memory header is zeroed and checkpoint and
+**     log recovery are run.
+**   * A shared lock is taken on DMS2.
+**   * An exclusive lock is attempted on each LSM_LOCK_RWCLIENT() slot in
+**     turn. The index of the first slot obtained is stored in
+**     pDb->iRwclient.
+**
+** Returns LSM_OK if successful, or an LSM error code otherwise. If the
+** DMS1 lock cannot be obtained, or anything up to and including the DMS2
+** shared lock fails, pDb->pShmhdr is reset to NULL.
+*/
+static int doDbConnect(lsm_db *pDb){
+  const int nUsMax = 100000;      /* Max value for nUs */
+  int nUs = 1000;                 /* us to wait between DMS1 attempts */
+  int rc;
+
+  /* Obtain a pointer to the shared-memory header */
+  assert( pDb->pShmhdr==0 );
+  assert( pDb->bReadonly==0 );
+  rc = lsmShmCacheChunks(pDb, 1);
+  if( rc!=LSM_OK ) return rc;
+  pDb->pShmhdr = (ShmHeader *)pDb->apShm[0];
+
+  /* Block for an exclusive lock on DMS1. This lock serializes all calls
+  ** to doDbConnect() and doDbDisconnect() across all processes. While the
+  ** lock is busy, sleep and retry, doubling the delay each time up to a
+  ** ceiling of nUsMax microseconds. */
+  while( 1 ){
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_EXCL, 1);
+    if( rc!=LSM_BUSY ) break;
+    lsmEnvSleep(pDb->pEnv, nUs);
+    nUs = nUs * 2;
+    if( nUs>nUsMax ) nUs = nUsMax;
+  }
+  if( rc!=LSM_OK ){
+    pDb->pShmhdr = 0;
+    return rc;
+  }
+
+  /* Try an exclusive lock on DMS2/DMS3. If successful, this is the first
+  ** and only connection to the database. In this case initialize the
+  ** shared-memory and run log file recovery. */
+  assert( LSM_LOCK_DMS3==1+LSM_LOCK_DMS2 );
+  rc = lsmShmTestLock(pDb, LSM_LOCK_DMS2, 2, LSM_LOCK_EXCL);
+  if( rc==LSM_OK ){
+    memset(pDb->pShmhdr, 0, sizeof(ShmHeader));
+    rc = lsmCheckpointRecover(pDb);
+    if( rc==LSM_OK ){
+      rc = lsmLogRecover(pDb);
+    }
+    if( rc==LSM_OK ){
+      ShmHeader *pShm = pDb->pShmhdr;
+      pShm->aReader[0].iLsmId = lsmCheckpointId(pShm->aSnap1, 0);
+      pShm->aReader[0].iTreeId = pDb->treehdr.iUsedShmid;
+    }
+  }else if( rc==LSM_BUSY ){
+    /* Some other connection already exists. Not an error. */
+    rc = LSM_OK;
+  }
+
+  /* Take a shared lock on DMS2. In multi-process mode this lock "cannot"
+  ** fail, as connections may only hold an exclusive lock on DMS2 if they
+  ** first hold an exclusive lock on DMS1. And this connection is currently
+  ** holding the exclusive lock on DMS1.
+  **
+  ** However, if some other connection has the database open in
+  ** single-process mode, this operation will fail. In this case, return
+  ** the error to the caller - the attempt to connect to the db has failed.
+  */
+  if( rc==LSM_OK ){
+    rc = lsmShmLock(pDb, LSM_LOCK_DMS2, LSM_LOCK_SHARED, 0);
+  }
+
+  /* If anything went wrong, clear the shared-memory header pointer.
+  ** Otherwise, try to take an exclusive lock on one of the
+  ** LSM_LOCK_RWCLIENT() locks. Unlock DMS1 in any case. */
+  if( rc!=LSM_OK ){
+    pDb->pShmhdr = 0;
+  }else{
+    int i;
+    for(i=0; i<LSM_LOCK_NRWCLIENT; i++){
+      int rc2 = lsmShmLock(pDb, LSM_LOCK_RWCLIENT(i), LSM_LOCK_EXCL, 0);
+      if( rc2==LSM_OK ) pDb->iRwclient = i;
+      /* A busy slot means "try the next one". Anything else - success or
+      ** a real error - ends the search. */
+      if( rc2!=LSM_BUSY ){
+        rc = rc2;
+        break;
+      }
+    }
+  }
+  lsmShmLock(pDb, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+
+  return rc;
+}
+
+/*
+** Open the file named p->zName and store the handle in p->pFile. The
+** first attempt is made with flags==0. If that attempt returns LSM_IOERR
+** and bRoOk is true, a second attempt is made with LSM_OPEN_READONLY and
+** p->bReadonly is set to 1. The return code of the last attempt made is
+** returned.
+*/
+static int dbOpenSharedFd(lsm_env *pEnv, Database *p, int bRoOk){
+  int rc = lsmEnvOpen(pEnv, p->zName, 0, &p->pFile);
+
+  if( rc!=LSM_IOERR || !bRoOk ) return rc;
+
+  rc = lsmEnvOpen(pEnv, p->zName, LSM_OPEN_READONLY, &p->pFile);
+  p->bReadonly = 1;
+  return rc;
+}
+
+/*
+** Attach database handle pDb to the shared Database object for the
+** database identified by canonical path zName. If this is the first
+** connection to the named database, a new Database object is allocated.
+** Otherwise, the existing object is reused and its reference count
+** incremented.
+**
+** pDb->pDatabase is set to point to the shared Database structure, or to
+** NULL if no object could be found or created. LSM_OK is returned if
+** successful, or an LSM error code otherwise.
+**
+** After the Database object has been attached, the file-system object is
+** opened (lsmFsOpen()) and, for read-write handles only, configured and
+** connected to the shared-memory system (doDbConnect()).
+**
+** Each successful call to this function should be (eventually) matched
+** by a call to lsmDbDatabaseRelease().
+*/
+int lsmDbDatabaseConnect(
+ lsm_db *pDb, /* Database handle */
+ const char *zName /* Full-path to db file */
+){
+ lsm_env *pEnv = pDb->pEnv;
+ int rc; /* Return code */
+ Database *p = 0; /* Shared object; stored in pDb->pDatabase */
+ int nName = lsmStrlen(zName);
+
+ assert( pDb->pDatabase==0 );
+ rc = enterGlobalMutex(pEnv);
+ if( rc==LSM_OK ){
+
+ /* Search the global list for an existing object. TODO: Need something
+ ** better than the memcmp() below to figure out if a given Database
+ ** object represents the requested file. */
+ for(p=gShared.pDatabase; p; p=p->pDbNext){
+ if( nName==p->nName && 0==memcmp(zName, p->zName, nName) ) break;
+ }
+
+ /* If no suitable Database object was found, allocate a new one. */
+ if( p==0 ){
+ /* The name is stored in the same allocation, directly after the struct */
+ p = (Database *)lsmMallocZeroRc(pEnv, sizeof(Database)+nName+1, &rc);
+
+ /* If the allocation was successful, fill in other fields and
+ ** allocate the client mutex. */
+ if( rc==LSM_OK ){
+ p->bMultiProc = pDb->bMultiProc;
+ p->zName = (char *)&p[1];
+ p->nName = nName;
+ memcpy((void *)p->zName, zName, nName+1);
+ rc = lsmMutexNew(pEnv, &p->pClientMutex);
+ }
+
+ /* If nothing has gone wrong so far, open the shared fd. And if that
+ ** succeeds and this connection requested single-process mode,
+ ** attempt to take the exclusive lock on DMS2. */
+ if( rc==LSM_OK ){
+ int bReadonly = (pDb->bReadonly && pDb->bMultiProc);
+ rc = dbOpenSharedFd(pDb->pEnv, p, bReadonly);
+ }
+
+ if( rc==LSM_OK && p->bMultiProc==0 ){
+ /* Hold an exclusive lock DMS1 while grabbing DMS2. This ensures
+ ** that any ongoing call to doDbDisconnect() (even one in another
+ ** process) is finished before proceeding. */
+ assert( p->bReadonly==0 );
+ rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_EXCL);
+ if( rc==LSM_OK ){
+ rc = lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS2, LSM_LOCK_EXCL);
+ lsmEnvLock(pDb->pEnv, p->pFile, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK);
+ }
+ }
+
+ /* On success link the new object into the global list. Otherwise
+ ** discard it. */
+ if( rc==LSM_OK ){
+ p->pDbNext = gShared.pDatabase;
+ gShared.pDatabase = p;
+ }else{
+ freeDatabase(pEnv, p);
+ p = 0;
+ }
+ }
+
+ if( p ){
+ p->nDbRef++;
+ }
+ leaveGlobalMutex(pEnv);
+
+ /* Add pDb to the list of connections, under the client mutex. */
+ if( p ){
+ lsmMutexEnter(pDb->pEnv, p->pClientMutex);
+ pDb->pNext = p->pConn;
+ p->pConn = pDb;
+ lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+ }
+ }
+
+ pDb->pDatabase = p;
+ if( rc==LSM_OK ){
+ assert( p );
+ rc = lsmFsOpen(pDb, zName, p->bReadonly);
+ }
+
+ /* If the db handle is read-write, then connect to the system now. Run
+ ** recovery as necessary. Or, if this is a read-only database handle,
+ ** defer attempting to connect to the system until a read-transaction
+ ** is opened. */
+ if( pDb->bReadonly==0 ){
+ if( rc==LSM_OK ){
+ rc = lsmFsConfigure(pDb);
+ }
+ if( rc==LSM_OK ){
+ rc = doDbConnect(pDb);
+ }
+ }
+
+ return rc;
+}
+
+/*
+** If handle pDb has a file-system object, obtain its file handle wrapper
+** from lsmFsDeferClose() and push it onto the front of the
+** Database.pLsmFile list of the shared Database object. The only caller
+** in this file invokes this function while holding the client mutex.
+*/
+static void dbDeferClose(lsm_db *pDb){
+  Database *pShared;
+  LsmFile *pDeferred;
+
+  if( pDb->pFS==0 ) return;
+
+  pShared = pDb->pDatabase;
+  pDeferred = lsmFsDeferClose(pDb->pFS);
+  pDeferred->pNext = pShared->pLsmFile;
+  pShared->pLsmFile = pDeferred;
+}
+
+/*
+** Pop the first entry from the Database.pLsmFile list of deferred closes
+** belonging to the Database object of handle db, and return it. If the
+** list is empty, NULL is returned. The list is accessed under the client
+** mutex.
+*/
+LsmFile *lsmDbRecycleFd(lsm_db *db){
+  Database *pShared = db->pDatabase;
+  LsmFile *pHead;
+
+  lsmMutexEnter(db->pEnv, pShared->pClientMutex);
+  pHead = pShared->pLsmFile;
+  if( pHead!=0 ){
+    pShared->pLsmFile = pHead->pNext;
+  }
+  lsmMutexLeave(db->pEnv, pShared->pClientMutex);
+
+  return pHead;
+}
+
+/*
+** Release a reference to a Database object obtained from
+** lsmDbDatabaseConnect(). There should be exactly one call to this
+** function for each successful call to lsmDbDatabaseConnect().
+**
+** If pDb is still connected to the shared-memory system it is
+** disconnected first. The handle is then removed from the list of
+** connections and its file handle added to the list of deferred closes.
+** If this was the last reference, the Database object is unlinked from
+** the global list and all of its resources are released.
+**
+** This is a no-op if pDb->pDatabase is NULL.
+*/
+void lsmDbDatabaseRelease(lsm_db *pDb){
+  Database *p = pDb->pDatabase;
+  if( p ){
+    lsm_db **ppDb;
+
+    if( pDb->pShmhdr ){
+      doDbDisconnect(pDb);
+    }
+
+    /* Unlink pDb from the connection list and hand its file handle over
+    ** to the deferred-close list, all under the client mutex. */
+    lsmFsUnmap(pDb->pFS);
+    lsmMutexEnter(pDb->pEnv, p->pClientMutex);
+    for(ppDb=&p->pConn; *ppDb!=pDb; ppDb=&((*ppDb)->pNext));
+    *ppDb = pDb->pNext;
+    dbDeferClose(pDb);
+    lsmMutexLeave(pDb->pEnv, p->pClientMutex);
+
+    enterGlobalMutex(pDb->pEnv);
+    p->nDbRef--;
+    if( p->nDbRef==0 ){
+      LsmFile *pIter;
+      LsmFile *pNext;
+      Database **pp;
+
+      /* Remove the Database structure from the linked list. */
+      for(pp=&gShared.pDatabase; *pp!=p; pp=&((*pp)->pDbNext));
+      *pp = p->pDbNext;
+
+      /* If they were allocated from the heap, free the shared memory chunks */
+      if( p->bMultiProc==0 ){
+        int i;
+        for(i=0; i<p->nShmChunk; i++){
+          lsmFree(pDb->pEnv, p->apShmChunk[i]);
+        }
+      }
+
+      /* Close any outstanding file descriptors */
+      for(pIter=p->pLsmFile; pIter; pIter=pNext){
+        pNext = pIter->pNext;
+        lsmEnvClose(pDb->pEnv, pIter->pFile);
+        lsmFree(pDb->pEnv, pIter);
+      }
+      freeDatabase(pDb->pEnv, p);
+    }
+    leaveGlobalMutex(pDb->pEnv);
+  }
+}
+
+Level *lsmDbSnapshotLevel(Snapshot *pSnapshot){
+  /* Accessor for the pLevel field of the snapshot. */
+  Level *pLevel = pSnapshot->pLevel;
+  return pLevel;
+}
+
+void lsmDbSnapshotSetLevel(Snapshot *pSnap, Level *pLevel){
+  /* Mutator for the pLevel field of the snapshot. */
+  (*pSnap).pLevel = pLevel;
+}
+
+/* TODO: Shuffle things around to get rid of this */
+static int firstSnapshotInUse(lsm_db *, i64 *);
+
+/*
+** Context object used by the lsmWalkFreelist() utility. lsmWalkFreelist()
+** uses two of these, chained together: the xUsr callback of the first is
+** walkFreelistCb() with the second context as its argument, and the xUsr
+** callback of the second is the callback supplied by the caller.
+*/
+typedef struct WalkFreelistCtx WalkFreelistCtx;
+struct WalkFreelistCtx {
+ lsm_db *pDb; /* Database handle the walk is running on */
+ int bReverse; /* True to iterate from largest to smallest block */
+ Freelist *pFreelist; /* In-memory free-list to merge in (may be NULL) */
+ int iFree; /* Index of next entry of pFreelist->aEntry[] to visit */
+ int (*xUsr)(void *, int, i64); /* User callback function */
+ void *pUsrctx; /* User callback context */
+ int bDone; /* Set to true after xUsr() returns true */
+};
+
+/*
+** Callback used by lsmWalkFreelist(). pCtx points to a WalkFreelistCtx.
+**
+** Before the (iBlk, iSnapshot) pair passed in is forwarded to the xUsr
+** callback, any entries of the in-memory free-list p->pFreelist that come
+** at or before iBlk in the direction of iteration are delivered to xUsr
+** first. In-memory entries with a negative id are consumed but not
+** delivered. If the in-memory list contains an entry for iBlk itself,
+** that entry takes the place of the pair passed in, which is then not
+** forwarded at all.
+**
+** Returns 1 (and sets p->bDone) as soon as xUsr returns non-zero, or 0
+** to continue the walk.
+*/
+static int walkFreelistCb(void *pCtx, int iBlk, i64 iSnapshot){
+ WalkFreelistCtx *p = (WalkFreelistCtx *)pCtx;
+ const int iDir = (p->bReverse ? -1 : 1);
+ Freelist *pFree = p->pFreelist;
+
+ assert( p->bDone==0 );
+ assert( iBlk>=0 );
+ if( pFree ){
+ while( (p->iFree < pFree->nEntry) && p->iFree>=0 ){
+ FreelistEntry *pEntry = &pFree->aEntry[p->iFree];
+ /* Stop at the first in-memory entry that lies beyond iBlk */
+ if( (p->bReverse==0 && pEntry->iBlk>(u32)iBlk)
+ || (p->bReverse!=0 && pEntry->iBlk<(u32)iBlk)
+ ){
+ break;
+ }else{
+ p->iFree += iDir;
+ if( pEntry->iId>=0
+ && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId)
+ ){
+ p->bDone = 1;
+ return 1;
+ }
+ /* The in-memory entry overrides the one passed to this callback */
+ if( pEntry->iBlk==(u32)iBlk ) return 0;
+ }
+ }
+ }
+
+ if( p->xUsr(p->pUsrctx, iBlk, iSnapshot) ){
+ p->bDone = 1;
+ return 1;
+ }
+ return 0;
+}
+
+/*
+** The database handle passed as the first argument must be the worker
+** connection. This function iterates through the contents of the current
+** free block list, invoking the supplied callback once for each list
+** element. Iteration stops as soon as the callback returns non-zero.
+**
+** The difference between this function and lsmSortedWalkFreelist() is
+** that lsmSortedWalkFreelist() only considers those free-list elements
+** stored within the LSM. This function also merges in any in-memory
+** elements - those in the free-list of the worker snapshot and those in
+** pDb->pFreelist (if it is not NULL). In-memory elements with a negative
+** id are not passed to the callback.
+**
+** Returns the value returned by lsmSortedWalkFreelist(), except that
+** LSM_OK is returned if the callback ends the iteration while the
+** left-over in-memory elements are being visited.
+*/
+int lsmWalkFreelist(
+  lsm_db *pDb,                    /* Database handle (must be worker) */
+  int bReverse,                   /* True to iterate from largest to smallest */
+  int (*x)(void *, int, i64),     /* Callback function */
+  void *pCtx                      /* First argument to pass to callback */
+){
+  const int iDir = (bReverse ? -1 : 1);
+  int rc;
+  int iCtx;
+
+  WalkFreelistCtx ctx[2];
+
+  /* ctx[0] merges in the free-list of the worker snapshot and passes its
+  ** output on to ctx[1]. */
+  ctx[0].pDb = pDb;
+  ctx[0].bReverse = bReverse;
+  ctx[0].pFreelist = &pDb->pWorker->freelist;
+  if( ctx[0].pFreelist && bReverse ){
+    ctx[0].iFree = ctx[0].pFreelist->nEntry-1;
+  }else{
+    ctx[0].iFree = 0;
+  }
+  ctx[0].xUsr = walkFreelistCb;
+  ctx[0].pUsrctx = (void *)&ctx[1];
+  ctx[0].bDone = 0;
+
+  /* ctx[1] merges in pDb->pFreelist and passes its output on to the
+  ** callback supplied by the caller. */
+  ctx[1].pDb = pDb;
+  ctx[1].bReverse = bReverse;
+  ctx[1].pFreelist = pDb->pFreelist;
+  if( ctx[1].pFreelist && bReverse ){
+    ctx[1].iFree = ctx[1].pFreelist->nEntry-1;
+  }else{
+    ctx[1].iFree = 0;
+  }
+  ctx[1].xUsr = x;
+  ctx[1].pUsrctx = pCtx;
+  ctx[1].bDone = 0;
+
+  rc = lsmSortedWalkFreelist(pDb, bReverse, walkFreelistCb, (void *)&ctx[0]);
+
+  /* Unless the callback has already ended the iteration, visit whatever
+  ** in-memory elements were not consumed during the walk above. */
+  if( ctx[0].bDone==0 ){
+    for(iCtx=0; iCtx<2; iCtx++){
+      int i;
+      WalkFreelistCtx *p = &ctx[iCtx];
+      for(i=p->iFree;
+          p->pFreelist && rc==LSM_OK && i<p->pFreelist->nEntry && i>=0;
+          i += iDir
+      ){
+        FreelistEntry *pEntry = &p->pFreelist->aEntry[i];
+        if( pEntry->iId>=0 && p->xUsr(p->pUsrctx, pEntry->iBlk, pEntry->iId) ){
+          return LSM_OK;
+        }
+      }
+    }
+  }
+
+  return rc;
+}
+
+
+/*
+** Context object handed (as the void* argument) to findFreeblockCb().
+*/
+typedef struct FindFreeblockCtx FindFreeblockCtx;
+struct FindFreeblockCtx {
+ i64 iInUse; /* Snapshot id compared against each free-list entry */
+ int iRet; /* OUT: block number selected by the callback (0 if none) */
+ int bNotOne; /* If true, block 1 is never selected */
+};
+
+/*
+** Free-list walker callback used by findFreeblock(). pCtx points to a
+** FindFreeblockCtx. A block is selected if its snapshot id is less than
+** ctx.iInUse and it is not block 1 while ctx.bNotOne is set. When a block
+** is selected its number is stored in ctx.iRet and 1 is returned to stop
+** the walk. Otherwise 0 is returned.
+*/
+static int findFreeblockCb(void *pCtx, int iBlk, i64 iSnapshot){
+  FindFreeblockCtx *p = (FindFreeblockCtx *)pCtx;
+  if( iSnapshot<p->iInUse && (iBlk!=1 || p->bNotOne==0) ){
+    p->iRet = iBlk;
+    return 1;
+  }
+  return 0;
+}
+
+/*
+** Walk the free-list of pDb in forward order looking for a block that
+** findFreeblockCb() accepts for the given iInUse and bNotOne values.
+** *piRet is set to the number of the block found, or to 0 if there is
+** none. The return code of lsmWalkFreelist() is returned.
+*/
+static int findFreeblock(lsm_db *pDb, i64 iInUse, int bNotOne, int *piRet){
+  FindFreeblockCtx sFind;         /* Context for findFreeblockCb() */
+  int rc;                         /* Return code */
+
+  sFind.iRet = 0;
+  sFind.bNotOne = bNotOne;
+  sFind.iInUse = iInUse;
+
+  rc = lsmWalkFreelist(pDb, 0, findFreeblockCb, (void *)&sFind);
+  *piRet = sFind.iRet;
+  return rc;
+}
+
+/*
+** Allocate a new database file block to write data to, either by extending
+** the database file or by recycling a free-list entry. The worker snapshot
+** must be held in order to call this function.
+**
+** If iBefore is greater than zero, only a recycled block with a number
+** smaller than iBefore is acceptable, and block 1 is never recycled. If
+** no such block is available the file is not extended and *piBlk is set
+** to zero.
+**
+** If successful, *piBlk is set to the block number allocated and LSM_OK is
+** returned. Otherwise, *piBlk is zeroed and an lsm error code returned.
+*/
+int lsmBlockAllocate(lsm_db *pDb, int iBefore, int *piBlk){
+ Snapshot *p = pDb->pWorker;
+ int iRet = 0; /* Block number of allocated block */
+ int rc = LSM_OK;
+ i64 iInUse = 0; /* Snapshot id still in use */
+ i64 iSynced = 0; /* Snapshot id synced to disk */
+
+ assert( p );
+
+#ifdef LSM_LOG_FREELIST
+ {
+ static int nCall = 0;
+ char *zFree = 0;
+ nCall++;
+ rc = lsmInfoFreelist(pDb, &zFree);
+ if( rc!=LSM_OK ) return rc;
+ lsmLogMessage(pDb, 0, "lsmBlockAllocate(): %d freelist: %s", nCall, zFree);
+ lsmFree(pDb->pEnv, zFree);
+ }
+#endif
+
+ /* Set iInUse to the smallest snapshot id that is either:
+ **
+ ** * Currently in use by a database client,
+ ** * May be used by a database client in the future, or
+ ** * Is the most recently checkpointed snapshot (i.e. the one that will
+ ** be used following recovery if a failure occurs at this point).
+ */
+ rc = lsmCheckpointSynced(pDb, &iSynced, 0, 0);
+ if( rc==LSM_OK && iSynced==0 ) iSynced = p->iId;
+ iInUse = iSynced;
+ if( rc==LSM_OK && pDb->iReader>=0 ){
+ assert( pDb->pClient );
+ iInUse = LSM_MIN(iInUse, pDb->pClient->iId);
+ }
+ if( rc==LSM_OK ) rc = firstSnapshotInUse(pDb, &iInUse);
+
+#ifdef LSM_LOG_FREELIST
+ {
+ lsmLogMessage(pDb, 0, "lsmBlockAllocate(): "
+ "snapshot-in-use: %lld (iSynced=%lld) (client-id=%lld)",
+ iInUse, iSynced, (pDb->iReader>=0 ? pDb->pClient->iId : 0)
+ );
+ }
+#endif
+
+
+ /* Unless there exists a read-only transaction (which prevents us from
+ ** recycling any blocks regardless), query the free block list for a
+ ** suitable block to reuse.
+ **
+ ** It might seem more natural to check for a read-only transaction at
+ ** the start of this function. However, it is better to wait until after
+ ** the call to lsmCheckpointSynced() to do so.
+ */
+ if( rc==LSM_OK ){
+ int bRotrans;
+ rc = lsmDetectRoTrans(pDb, &bRotrans);
+
+ if( rc==LSM_OK && bRotrans==0 ){
+ rc = findFreeblock(pDb, iInUse, (iBefore>0), &iRet);
+ }
+ }
+
+ if( iBefore>0 && (iRet<=0 || iRet>=iBefore) ){
+ /* The caller asked for a block before iBefore and none is available */
+ iRet = 0;
+
+ }else if( rc==LSM_OK ){
+ /* If a block was found in the free block list, use it and remove it from
+ ** the list. Otherwise, if no suitable block was found, allocate one from
+ ** the end of the file. */
+ if( iRet>0 ){
+#ifdef LSM_LOG_FREELIST
+ lsmLogMessage(pDb, 0,
+ "reusing block %d (snapshot-in-use=%lld)", iRet, iInUse);
+#endif
+ /* An id of -1 marks the free-list entry as deleted */
+ rc = freelistAppend(pDb, iRet, -1);
+ if( rc==LSM_OK ){
+ rc = dbTruncate(pDb, iInUse);
+ }
+ }else{
+ iRet = ++(p->nBlock);
+#ifdef LSM_LOG_FREELIST
+ lsmLogMessage(pDb, 0, "extending file to %d blocks", iRet);
+#endif
+ }
+ }
+
+ assert( iBefore>0 || iRet>0 || rc!=LSM_OK );
+ *piBlk = iRet;
+ return rc;
+}
+
+/*
+** Free database block iBlk by adding it to the free-list, tagged with the
+** id of the current worker snapshot. The worker snapshot must be held in
+** order to call this function.
+**
+** Returns LSM_OK if successful, or an lsm error code (e.g. LSM_NOMEM)
+** otherwise.
+*/
+int lsmBlockFree(lsm_db *pDb, int iBlk){
+  Snapshot *pWorker = pDb->pWorker;
+  int rc;
+
+  assert( lsmShmAssertWorker(pDb) );
+
+#ifdef LSM_LOG_FREELIST
+  lsmLogMessage(pDb, LSM_OK, "lsmBlockFree(): Free block %d", iBlk);
+#endif
+
+  rc = freelistAppend(pDb, iBlk, pWorker->iId);
+  return rc;
+}
+
+/*
+** Refree database block iBlk. The worker snapshot must be held in order
+** to call this function.
+**
+** Refreeing is required when a block is allocated using
+** lsmBlockAllocate() but then not used. This function pushes the block
+** back onto the free-list with an id of zero. Refreeing a block is
+** different from freeing it, as a refreed block may be reused
+** immediately, whereas a freed block can not be reused until (at least)
+** after the next checkpoint.
+*/
+int lsmBlockRefree(lsm_db *pDb, int iBlk){
+#ifdef LSM_LOG_FREELIST
+  lsmLogMessage(pDb, LSM_OK, "lsmBlockRefree(): Refree block %d", iBlk);
+#endif
+
+  return freelistAppend(pDb, iBlk, 0);
+}
+
+/*
+** If required, copy a database checkpoint from shared memory into the
+** database itself.
+**
+** The WORKER lock must not be held when this is called. This is because
+** this function may indirectly call fsync(). And the WORKER lock should
+** not be held that long (in case it is required by a client flushing an
+** in-memory tree to disk).
+**
+** The CHECKPOINTER lock is held for the duration of the call. If it
+** cannot be obtained, the error is returned immediately.
+**
+** If pnWrite is not NULL and no error occurs, *pnWrite is set to the
+** value of local variable nWrite. If the checkpoint was stored, that is
+** the difference between the lsmCheckpointNWrite() values of the new
+** checkpoint and of the checkpoint previously on disk.
+** NOTE(review): when the checkpoint is already on disk (bDone), nWrite is
+** left holding the on-disk lsmCheckpointNWrite() value rather than 0 -
+** confirm this is what callers expect.
+*/
+int lsmCheckpointWrite(lsm_db *pDb, u32 *pnWrite){
+ int rc; /* Return Code */
+ u32 nWrite = 0;
+
+ assert( pDb->pWorker==0 );
+ assert( 1 || pDb->pClient==0 );
+ assert( lsmShmAssertLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK) );
+
+ rc = lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_EXCL, 0);
+ if( rc!=LSM_OK ) return rc;
+
+ rc = lsmCheckpointLoad(pDb, 0);
+ if( rc==LSM_OK ){
+ int nBlock = lsmCheckpointNBlock(pDb->aSnapshot);
+ ShmHeader *pShm = pDb->pShmhdr;
+ int bDone = 0; /* True if checkpoint is already stored */
+
+ /* Check if this checkpoint has already been written to the database
+ ** file. If so, set variable bDone to true. */
+ if( pShm->iMetaPage ){
+ MetaPage *pPg; /* Meta page */
+ u8 *aData; /* Meta-page data buffer */
+ int nData; /* Size of aData[] in bytes */
+ i64 iCkpt; /* Id of checkpoint just loaded */
+ i64 iDisk = 0; /* Id of checkpoint already stored in db */
+ iCkpt = lsmCheckpointId(pDb->aSnapshot, 0);
+ rc = lsmFsMetaPageGet(pDb->pFS, 0, pShm->iMetaPage, &pPg);
+ if( rc==LSM_OK ){
+ aData = lsmFsMetaPageData(pPg, &nData);
+ iDisk = lsmCheckpointId((u32 *)aData, 1);
+ nWrite = lsmCheckpointNWrite((u32 *)aData, 1);
+ lsmFsMetaPageRelease(pPg);
+ }
+ bDone = (iDisk>=iCkpt);
+ }
+
+ if( rc==LSM_OK && bDone==0 ){
+ /* Alternate between meta pages 1 and 2 */
+ int iMeta = (pShm->iMetaPage % 2) + 1;
+ /* Unless safety is off, sync before and after storing the checkpoint */
+ if( pDb->eSafety!=LSM_SAFETY_OFF ){
+ rc = lsmFsSyncDb(pDb->pFS, nBlock);
+ }
+ if( rc==LSM_OK ) rc = lsmCheckpointStore(pDb, iMeta);
+ if( rc==LSM_OK && pDb->eSafety!=LSM_SAFETY_OFF){
+ rc = lsmFsSyncDb(pDb->pFS, 0);
+ }
+ if( rc==LSM_OK ){
+ pShm->iMetaPage = iMeta;
+ nWrite = lsmCheckpointNWrite(pDb->aSnapshot, 0) - nWrite;
+ }
+#ifdef LSM_LOG_WORK
+ lsmLogMessage(pDb, 0, "finish checkpoint %d",
+ (int)lsmCheckpointId(pDb->aSnapshot, 0)
+ );
+#endif
+ }
+ }
+
+ lsmShmLock(pDb, LSM_LOCK_CHECKPOINTER, LSM_LOCK_UNLOCK, 0);
+ if( pnWrite && rc==LSM_OK ) *pnWrite = nWrite;
+ return rc;
+}
+
+/*
+** Attempt to take the exclusive WORKER lock and, if that succeeds, load
+** the current worker snapshot using lsmCheckpointLoadWorker(). Returns
+** LSM_OK if both steps succeed, or the error code of the step that failed.
+*/
+int lsmBeginWork(lsm_db *pDb){
+  int rc = lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_EXCL, 0);
+  if( rc!=LSM_OK ) return rc;
+
+  /* Lock obtained. Deserialize the current worker snapshot. */
+  return lsmCheckpointLoadWorker(pDb);
+}
+
+/*
+** Free snapshot object p along with its level list, its free-list entry
+** array and its redirect array. Passing NULL is a harmless no-op.
+*/
+void lsmFreeSnapshot(lsm_env *pEnv, Snapshot *p){
+  if( p==0 ) return;
+
+  lsmSortedFreeLevel(pEnv, p->pLevel);
+  lsmFree(pEnv, p->freelist.aEntry);
+  lsmFree(pEnv, p->redirect.a);
+  lsmFree(pEnv, p);
+}
+
+/*
+** Attempt to populate one of the read-lock slots to contain lock values
+** iLsm/iShm. Or, if such a slot exists already, this function is a no-op.
+**
+** It is not an error if no slot can be populated because the write-lock
+** cannot be obtained. If any other error occurs, return an LSM error code.
+** Otherwise, LSM_OK.
+**
+** This function is called at various points to try to ensure that there
+** always exists at least one read-lock slot that can be used by a read-only
+** client. And so that, in the usual case, there is an "exact match" available
+** whenever a read transaction is opened by any client. At present this
+** function is called when:
+**
+** * A write transaction that called lsmTreeDiscardOld() is committed, and
+** * Whenever the working snapshot is updated (i.e. lsmFinishWork()).
+*/
+static int dbSetReadLock(lsm_db *db, i64 iLsm, u32 iShm){
+  int rc = LSM_OK;
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+
+  /* Check if there is already a slot containing the required values. */
+  for(i=0; i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId==iLsm && p->iTreeId==iShm ) return LSM_OK;
+  }
+
+  /* Iterate through all read-lock slots, attempting to take a write-lock
+  ** on each of them. If a write-lock succeeds, populate the locked slot
+  ** with the required values and break out of the loop. */
+  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      /* Slot is in use by some other client. Try the next one. */
+      rc = LSM_OK;
+    }else{
+      ShmReader *p = &pShm->aReader[i];
+      p->iLsmId = iLsm;
+      p->iTreeId = iShm;
+      lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+      break;
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Release the read-lock currently held by connection db, if there is one
+** (db->iReader>=0), and mark the connection as holding no reader slot.
+** The read-only-transaction flag (db->bRoTrans) is cleared in all cases.
+**
+** Returns the result of the unlock operation, or LSM_OK if no read-lock
+** was held.
+*/
+int dbReleaseReadlock(lsm_db *db){
+  int rc = LSM_OK;
+  const int iSlot = db->iReader;
+
+  if( iSlot>=0 ){
+    rc = lsmShmLock(db, LSM_LOCK_READER(iSlot), LSM_LOCK_UNLOCK, 0);
+    db->iReader = -1;
+  }
+  db->bRoTrans = 0;
+
+  return rc;
+}
+
+
+/*
+** Finish a unit of work: save the worker snapshot (pDb->pWorker), try to
+** refresh a read-lock slot so that it names the new snapshot, free the
+** snapshot object and release the WORKER lock.
+**
+** Argument bFlush is true if the contents of the in-memory tree has just
+** been flushed to disk. The significance of this is that once the snapshot
+** created to hold the updated state of the database is synced to disk, log
+** file space can be recycled.
+**
+** *pRc is both an input and an output. If it contains an error code on
+** entry, the snapshot is not saved and no read-lock slot is updated, but
+** the snapshot object is still freed and the WORKER lock still released.
+** On return *pRc holds the final result code.
+*/
+void lsmFinishWork(lsm_db *pDb, int bFlush, int *pRc){
+ int rc = *pRc;
+ assert( rc!=0 || pDb->pWorker );
+ if( pDb->pWorker ){
+ /* If no error has occurred, serialize the worker snapshot and write
+ ** it to shared memory. */
+ if( rc==LSM_OK ){
+ rc = lsmSaveWorker(pDb, bFlush);
+ }
+
+ /* Assuming no error has occurred, update a read lock slot with the
+ ** new snapshot id (see comments above function dbSetReadLock()). */
+ if( rc==LSM_OK ){
+ /* iReader<0 means this connection holds no read-lock; presumably the
+ ** cached tree header may then be stale, so it is reloaded before
+ ** treehdr.iUsedShmid is used below. */
+ if( pDb->iReader<0 ){
+ rc = lsmTreeLoadHeader(pDb, 0);
+ }
+ if( rc==LSM_OK ){
+ rc = dbSetReadLock(pDb, pDb->pWorker->iId, pDb->treehdr.iUsedShmid);
+ }
+ }
+
+ /* Free the snapshot object. */
+ lsmFreeSnapshot(pDb->pEnv, pDb->pWorker);
+ pDb->pWorker = 0;
+ }
+
+ /* The WORKER lock is dropped whether or not an error occurred. */
+ lsmShmLock(pDb, LSM_LOCK_WORKER, LSM_LOCK_UNLOCK, 0);
+ *pRc = rc;
+}
+
+/*
+** Called when recovery is finished. Ends the in-memory tree transaction
+** by calling lsmTreeEndTransaction() with its second argument set to 1
+** (the argument that lsmFinishWriteTrans() passes its bCommit flag in).
+** Always returns LSM_OK.
+*/
+int lsmFinishRecovery(lsm_db *pDb){
+ lsmTreeEndTransaction(pDb, 1);
+ return LSM_OK;
+}
+
+/*
+** Check if the currently configured compression functions
+** (LSM_CONFIG_SET_COMPRESSION) are compatible with a database that has its
+** compression id set to iReq. Compression routines are compatible if iReq
+** is zero (indicating the database is empty), or if it is equal to the
+** compression id of the configured compression routines.
+**
+** If the check shows that the current compression functions are incompatible
+** and there is a compression factory registered, give it a chance to install
+** new compression routines.
+**
+** If, after any registered factory is invoked, the compression functions
+** are still incompatible, return LSM_MISMATCH. Otherwise, LSM_OK.
+*/
+int lsmCheckCompressionId(lsm_db *pDb, u32 iReq){
+  /* Nothing to do if the database is empty or the configured routines
+  ** already carry the requested id. */
+  if( iReq==LSM_COMPRESSION_EMPTY || pDb->compress.iId==iReq ){
+    return LSM_OK;
+  }
+
+  /* Mismatch. Let the factory, if one is registered, install a different
+  ** set of compression routines. bInFactory is set for the duration of
+  ** the callback only. */
+  if( pDb->factory.xFactory ){
+    pDb->bInFactory = 1;
+    pDb->factory.xFactory(pDb->factory.pCtx, pDb, iReq);
+    pDb->bInFactory = 0;
+  }
+
+  /* Re-test now that the factory has had its chance. */
+  return (pDb->compress.iId==iReq) ? LSM_OK : LSM_MISMATCH;
+}
+
+/*
+** Begin a read transaction. This function is a no-op if the connection
+** passed as the only argument already has an open read transaction.
+*/
+int lsmBeginReadTrans(lsm_db *pDb){
+  const int MAX_READLOCK_ATTEMPTS = 10;
+  const int nMaxAttempt = (pDb->bRoTrans ? 1 : MAX_READLOCK_ATTEMPTS);
+
+  int rc = LSM_OK;                /* Return code */
+  int iAttempt = 0;
+
+  assert( pDb->pWorker==0 );
+
+  /* Each iteration loads the tree header and snapshot, takes a read-lock
+  ** and then re-validates what was loaded. The loop ends once a read-lock
+  ** is held (iReader>=0), an error occurs, or nMaxAttempt attempts have
+  ** been made (a single attempt for a read-only transaction). */
+  while( rc==LSM_OK && pDb->iReader<0 && (iAttempt++)<nMaxAttempt ){
+    int iTreehdr = 0;
+    int iSnap = 0;
+    assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
+
+    /* Load the in-memory tree header. */
+    rc = lsmTreeLoadHeader(pDb, &iTreehdr);
+
+    /* Load the database snapshot */
+    if( rc==LSM_OK ){
+      if( lsmCheckpointClientCacheOk(pDb)==0 ){
+        lsmFreeSnapshot(pDb->pEnv, pDb->pClient);
+        pDb->pClient = 0;
+        lsmMCursorFreeCache(pDb);
+        lsmFsPurgeCache(pDb->pFS);
+        rc = lsmCheckpointLoad(pDb, &iSnap);
+      }else{
+        iSnap = 1;
+      }
+    }
+
+    /* Take a read-lock on the tree and snapshot just loaded. Then check
+    ** that the shared-memory still contains the same values. If so, proceed.
+    ** Otherwise, relinquish the read-lock and retry the whole procedure
+    ** (starting with loading the in-memory tree header). */
+    if( rc==LSM_OK ){
+      u32 iShmMax = pDb->treehdr.iUsedShmid;
+      u32 iShmMin = pDb->treehdr.iNextShmid+1-LSM_MAX_SHMCHUNKS;
+      rc = lsmReadlock(
+          pDb, lsmCheckpointId(pDb->aSnapshot, 0), iShmMin, iShmMax
+      );
+      if( rc==LSM_OK ){
+        if( lsmTreeLoadHeaderOk(pDb, iTreehdr)
+         && lsmCheckpointLoadOk(pDb, iSnap)
+        ){
+          /* Read lock has been successfully obtained. Deserialize the
+          ** checkpoint just loaded. TODO: This will be removed after
+          ** lsm_sorted.c is changed to work directly from the serialized
+          ** version of the snapshot. */
+          if( pDb->pClient==0 ){
+            rc = lsmCheckpointDeserialize(pDb, 0, pDb->aSnapshot,&pDb->pClient);
+          }
+          assert( (rc==LSM_OK)==(pDb->pClient!=0) );
+          assert( pDb->iReader>=0 );
+
+          /* Check that the client has the right compression hooks loaded.
+          ** If not, set rc to LSM_MISMATCH. */
+          if( rc==LSM_OK ){
+            rc = lsmCheckCompressionId(pDb, pDb->pClient->iCmpId);
+          }
+        }else{
+          rc = dbReleaseReadlock(pDb);
+        }
+      }
+
+      /* LSM_BUSY here is not an error - it just means another attempt
+      ** is required. */
+      if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+#if 0
+if( rc==LSM_OK && pDb->pClient ){
+  fprintf(stderr,
+      "reading %p: snapshot:%d used-shmid:%d trans-id:%d iOldShmid=%d\n",
+      (void *)pDb,
+      (int)pDb->pClient->iId, (int)pDb->treehdr.iUsedShmid,
+      (int)pDb->treehdr.root.iTransId,
+      (int)pDb->treehdr.iOldShmid
+  );
+}
+#endif
+  }
+
+  if( rc==LSM_OK ){
+    rc = lsmShmCacheChunks(pDb, pDb->treehdr.nChunk);
+  }
+  if( rc!=LSM_OK ){
+    dbReleaseReadlock(pDb);
+  }
+  /* All attempts were used up without a snapshot being loaded. */
+  if( pDb->pClient==0 && rc==LSM_OK ) rc = LSM_BUSY;
+  return rc;
+}
+
+/*
+** Used by a read-write connection to find out whether one or more
+** read-only transactions are currently open on the database (in this
+** context a read-only transaction is one opened by a read-only connection
+** on a non-live database).
+**
+** The test is made by checking whether an EXCLUSIVE lock on the ROTRANS
+** byte could be obtained; no lock is actually taken.
+**
+** If no error occurs, LSM_OK is returned and *pbExist is set to true if
+** some other connection has a read-only transaction open, or false
+** otherwise. If an error occurs an LSM error code is returned and the
+** final value of *pbExist should not be relied upon.
+*/
+int lsmDetectRoTrans(lsm_db *db, int *pbExist){
+  int rc;
+
+  /* Only a read-write connection may use this function. */
+  assert( db->bReadonly==0 );
+
+  rc = lsmShmTestLock(db, LSM_LOCK_ROTRANS, 1, LSM_LOCK_EXCL);
+  if( rc==LSM_BUSY ){
+    /* Somebody holds ROTRANS: that is an answer, not an error. */
+    *pbExist = 1;
+    return LSM_OK;
+  }
+
+  *pbExist = 0;
+  return rc;
+}
+
+/*
+** db is a read-only database handle in the disconnected state. This function
+** attempts to open a read-transaction on the database. This may involve
+** connecting to the database system (opening shared memory etc.).
+**
+** This is a no-op returning LSM_OK if db->bRoTrans is already set.
+** Otherwise LSM_OK is returned if the read transaction was opened, or an
+** LSM error code (possibly LSM_BUSY) if a lock could not be obtained or
+** some other step failed.
+*/
+int lsmBeginRoTrans(lsm_db *db){
+ int rc = LSM_OK;
+
+ assert( db->bReadonly && db->pShmhdr==0 );
+ assert( db->iReader<0 );
+
+ if( db->bRoTrans==0 ){
+
+ /* Attempt a shared-lock on DMS1. */
+ rc = lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_SHARED, 0);
+ if( rc!=LSM_OK ) return rc;
+
+ /* Test (without locking) whether a SHARED lock could be taken on the
+ ** RWCLIENT range. LSM_BUSY is interpreted below as "system is live". */
+ rc = lsmShmTestLock(
+ db, LSM_LOCK_RWCLIENT(0), LSM_LOCK_NREADER, LSM_LOCK_SHARED
+ );
+ if( rc==LSM_OK ){
+ /* System is not live. Take a SHARED lock on the ROTRANS byte and
+ ** release DMS1. Locking ROTRANS tells all read-write clients that they
+ ** may not recycle any disk space from within the database or log files,
+ ** as a read-only client may be using it. */
+ rc = lsmShmLock(db, LSM_LOCK_ROTRANS, LSM_LOCK_SHARED, 0);
+ lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+
+ if( rc==LSM_OK ){
+ /* With bRoTrans set, lsmShmCacheChunks() allocates private heap
+ ** chunks. The header chunk is zeroed and then populated by running
+ ** checkpoint recovery followed by log recovery. */
+ db->bRoTrans = 1;
+ rc = lsmShmCacheChunks(db, 1);
+ if( rc==LSM_OK ){
+ db->pShmhdr = (ShmHeader *)db->apShm[0];
+ memset(db->pShmhdr, 0, sizeof(ShmHeader));
+ rc = lsmCheckpointRecover(db);
+ if( rc==LSM_OK ){
+ rc = lsmLogRecover(db);
+ }
+ }
+ }
+ }else if( rc==LSM_BUSY ){
+ /* System is live! */
+ rc = lsmShmLock(db, LSM_LOCK_DMS3, LSM_LOCK_SHARED, 0);
+ lsmShmLock(db, LSM_LOCK_DMS1, LSM_LOCK_UNLOCK, 0);
+ if( rc==LSM_OK ){
+ rc = lsmShmCacheChunks(db, 1);
+ if( rc==LSM_OK ){
+ db->pShmhdr = (ShmHeader *)db->apShm[0];
+ }
+ }
+ }
+
+ /* In either case, finish by opening an ordinary read transaction. */
+ if( rc==LSM_OK ){
+ rc = lsmBeginReadTrans(db);
+ }
+ }
+
+ return rc;
+}
+
+/*
+** Close the currently open read transaction.
+**
+** If the transaction is a read-only transaction (pDb->bRoTrans), the
+** privately allocated shared-memory chunks cached in pDb->apShm[] are
+** freed, the cache is reset and the ROTRANS lock is released. In all
+** cases the read-lock is then released by dbReleaseReadlock(), which also
+** clears pDb->bRoTrans.
+*/
+void lsmFinishReadTrans(lsm_db *pDb){
+
+  /* Worker connections should not be closing read transactions. And
+  ** read transactions should only be closed after all cursors and write
+  ** transactions have been closed. Finally pClient should be non-NULL
+  ** if and only if pDb->iReader>=0. */
+  assert( pDb->pWorker==0 );
+  assert( pDb->pCsr==0 && pDb->nTransOpen==0 );
+
+  if( pDb->bRoTrans ){
+    int i;
+    for(i=0; i<pDb->nShm; i++){
+      lsmFree(pDb->pEnv, pDb->apShm[i]);
+    }
+    lsmFree(pDb->pEnv, pDb->apShm);
+    pDb->apShm = 0;
+    pDb->nShm = 0;
+    pDb->pShmhdr = 0;
+
+    lsmShmLock(pDb, LSM_LOCK_ROTRANS, LSM_LOCK_UNLOCK, 0);
+  }
+  dbReleaseReadlock(pDb);
+}
+
+/*
+** Open a write transaction.
+**
+** A read transaction is opened first if the connection does not already
+** have one. The WRITER lock is then taken in exclusive mode.
+**
+** Returns LSM_OK if the write transaction was opened. LSM_BUSY is returned
+** if the connection's cached tree header no longer matches the one in
+** shared memory (i.e. it is not reading the most recent version of the
+** database); it may also be returned by the locking calls. Any other
+** value is an LSM error code. On failure the WRITER lock is released and,
+** if no cursors are open, the read transaction is closed as well.
+*/
+int lsmBeginWriteTrans(lsm_db *pDb){
+ int rc = LSM_OK; /* Return code */
+ ShmHeader *pShm = pDb->pShmhdr; /* Shared memory header */
+
+ assert( pDb->nTransOpen==0 );
+ assert( pDb->bDiscardOld==0 );
+ assert( pDb->bReadonly==0 );
+
+ /* If there is no read-transaction open, open one now. */
+ if( pDb->iReader<0 ){
+ rc = lsmBeginReadTrans(pDb);
+ }
+
+ /* Attempt to take the WRITER lock */
+ if( rc==LSM_OK ){
+ rc = lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_EXCL, 0);
+ }
+
+ /* If the previous writer failed mid-transaction, run emergency rollback. */
+ if( rc==LSM_OK && pShm->bWriter ){
+ rc = lsmTreeRepair(pDb);
+ if( rc==LSM_OK ) pShm->bWriter = 0;
+ }
+
+ /* Check that this connection is currently reading from the most recent
+ ** version of the database. If not, return LSM_BUSY. */
+ if( rc==LSM_OK && memcmp(&pShm->hdr1, &pDb->treehdr, sizeof(TreeHeader)) ){
+ rc = LSM_BUSY;
+ }
+
+ if( rc==LSM_OK ){
+ rc = lsmLogBegin(pDb);
+ }
+
+ /* If everything was successful, set the "transaction-in-progress" flag
+ ** and return LSM_OK. Otherwise, if some error occurred, relinquish the
+ ** WRITER lock and return an error code. */
+ if( rc==LSM_OK ){
+ TreeHeader *p = &pDb->treehdr;
+ pShm->bWriter = 1;
+ p->root.iTransId++;
+ /* bDiscardOld is remembered so that lsmFinishWriteTrans() can update
+ ** a read-lock slot when this transaction commits. */
+ if( lsmTreeHasOld(pDb) && p->iOldLog==pDb->pClient->iLogOff ){
+ lsmTreeDiscardOld(pDb);
+ pDb->bDiscardOld = 1;
+ }
+ }else{
+ /* Unlocking is a no-op inside lsmShmLock() if WRITER was never taken. */
+ lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+ if( pDb->pCsr==0 ) lsmFinishReadTrans(pDb);
+ }
+ return rc;
+}
+
+/*
+** End the current write transaction. The connection is left with an open
+** read transaction. It is an error to call this if there is no open write
+** transaction.
+**
+** If the transaction was committed, then a commit record has already been
+** written into the log file when this function is called. Or, if the
+** transaction was rolled back, both the log file and in-memory tree
+** structure have already been restored. In either case, this function
+** merely releases locks and other resources held by the write-transaction.
+**
+** LSM_OK is returned if successful, or an LSM error code otherwise.
+*/
+int lsmFinishWriteTrans(lsm_db *pDb, int bCommit){
+ int rc = LSM_OK;
+ int bFlush = 0;
+
+ lsmLogEnd(pDb, bCommit);
+ /* On commit, if the in-memory tree has outgrown nTreeLimit, mark it as
+ ** "old" so that it can be flushed. (rc is always LSM_OK at this point.) */
+ if( rc==LSM_OK && bCommit && lsmTreeSize(pDb)>pDb->nTreeLimit ){
+ bFlush = 1;
+ lsmTreeMakeOld(pDb);
+ }
+ lsmTreeEndTransaction(pDb, bCommit);
+
+ if( rc==LSM_OK ){
+ if( bFlush && pDb->bAutowork ){
+ /* Auto-work enabled: do the work now, while still holding WRITER. */
+ rc = lsmSortedAutoWork(pDb, 1);
+ }else if( bCommit && pDb->bDiscardOld ){
+ /* lsmBeginWriteTrans() called lsmTreeDiscardOld() for this
+ ** transaction; see the comments above dbSetReadLock(). */
+ rc = dbSetReadLock(pDb, pDb->pClient->iId, pDb->treehdr.iUsedShmid);
+ }
+ }
+ pDb->bDiscardOld = 0;
+ lsmShmLock(pDb, LSM_LOCK_WRITER, LSM_LOCK_UNLOCK, 0);
+
+ /* Auto-work disabled: notify the application's work hook, if any, after
+ ** the WRITER lock has been released. */
+ if( bFlush && pDb->bAutowork==0 && pDb->xWork ){
+ pDb->xWork(pDb, pDb->pWorkCtx);
+ }
+ return rc;
+}
+
+
+/*
+** Return non-zero if the caller is holding the client mutex
+** (pDb->pDatabase->pClientMutex), as reported by lsmMutexHeld().
+**
+** This function is only compiled when LSM_DEBUG is defined.
+*/
+#ifdef LSM_DEBUG
+int lsmHoldingClientMutex(lsm_db *pDb){
+ return lsmMutexHeld(pDb->pEnv, pDb->pDatabase->pClientMutex);
+}
+#endif
+
+/*
+** Return 1 if read-lock slot p may be used by a reader of snapshot iLsm
+** whose acceptable tree ids span iShmMin..iShmMax, or 0 otherwise. The
+** slot qualifies when all of the following hold:
+**
+**   * it is populated (p->iLsmId is non-zero),
+**   * its snapshot id is no greater than iLsm, and
+**   * its tree id lies between iShmMin and iShmMax inclusive, as judged
+**     by shm_sequence_ge().
+*/
+static int slotIsUsable(ShmReader *p, i64 iLsm, u32 iShmMin, u32 iShmMax){
+  int bUsable = 0;
+  if( p->iLsmId && p->iLsmId<=iLsm ){
+    if( shm_sequence_ge(iShmMax, p->iTreeId) ){
+      if( shm_sequence_ge(p->iTreeId, iShmMin) ){
+        bUsable = 1;
+      }
+    }
+  }
+  return bUsable;
+}
+
+/*
+** Obtain a read-lock on the database version identified by the combination
+** of snapshot iLsm and in-memory tree id iShmMax. Slots whose tree id lies
+** anywhere in the range iShmMin..iShmMax are acceptable as a fallback.
+**
+** Three strategies are tried in order until one succeeds:
+**
+**   1. Take a SHARED lock on a slot that already holds exactly
+**      iLsm/iShmMax.
+**   2. Take an EXCLUSIVE lock on any slot, write iLsm/iShmMax into it and
+**      then take a SHARED lock on it.
+**   3. Take a SHARED lock on any slot for which slotIsUsable() is true.
+**
+** On success db->iReader is set to the index of the locked slot and LSM_OK
+** is returned. LSM_BUSY is returned if no slot could be locked. Any other
+** return value is an LSM error code.
+**
+** If db->bRoTrans is set no lock is taken: db->iReader is set to 0 and
+** LSM_OK returned.
+*/
+int lsmReadlock(lsm_db *db, i64 iLsm, u32 iShmMin, u32 iShmMax){
+  int rc = LSM_OK;
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+
+  assert( db->iReader<0 );
+  assert( shm_sequence_ge(iShmMax, iShmMin) );
+
+  /* This is a no-op if the read-only transaction flag is set. */
+  if( db->bRoTrans ){
+    db->iReader = 0;
+    return LSM_OK;
+  }
+
+  /* Search for an exact match. The slot contents are checked again after
+  ** the SHARED lock is obtained, as they may have been modified between
+  ** the first test and the lock being taken. */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId==iLsm && p->iTreeId==iShmMax ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK && p->iLsmId==iLsm && p->iTreeId==iShmMax ){
+        db->iReader = i;
+      }else if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+  }
+
+  /* Try to obtain a write-lock on each slot, in order. If successful, set
+  ** the slot values to iLsm/iShmMax. A busy slot is skipped; any other
+  ** locking error ends the search and is returned. The slot is written
+  ** only while the EXCLUSIVE lock is actually held. */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+    if( rc==LSM_BUSY ){
+      rc = LSM_OK;
+    }else if( rc==LSM_OK ){
+      ShmReader *p = &pShm->aReader[i];
+      p->iLsmId = iLsm;
+      p->iTreeId = iShmMax;
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      assert( rc!=LSM_BUSY );
+      if( rc==LSM_OK ) db->iReader = i;
+    }
+  }
+
+  /* Search for any usable slot */
+  for(i=0; db->iReader<0 && rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
+      rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_SHARED, 0);
+      if( rc==LSM_OK && slotIsUsable(p, iLsm, iShmMin, iShmMax) ){
+        db->iReader = i;
+      }else if( rc==LSM_BUSY ){
+        rc = LSM_OK;
+      }
+    }
+  }
+
+  if( rc==LSM_OK && db->iReader<0 ){
+    rc = LSM_BUSY;
+  }
+  return rc;
+}
+
+/*
+** This is used to check if there exists a read-lock locking a particular
+** version of either the in-memory tree or database file.
+**
+** If iLsmId is non-zero, then it is a snapshot id. If there exists a
+** read-lock on a slot whose snapshot id is less than or equal to iLsmId,
+** set *pbInUse to true. Or, if there is no such read-lock, set it to
+** false.
+**
+** Or, if iLsmId is zero, then iShmid is a shared-memory sequence id.
+** Search for a read-lock on a slot whose tree id is equal to or newer
+** than iShmid (as judged by shm_sequence_ge()).
+**
+** A matching slot counts as "in use" only if an EXCLUSIVE lock on it
+** cannot be obtained. If the lock can be obtained the slot is stale: it
+** is cleared (iLsmId set to 0) and the search continues.
+**
+** Returns LSM_OK if *pbInUse was determined, or an LSM error code if a
+** locking call failed for a reason other than LSM_BUSY (in which case
+** *pbInUse is set to 0).
+*/
+static int isInUse(lsm_db *db, i64 iLsmId, u32 iShmid, int *pbInUse){
+  ShmHeader *pShm = db->pShmhdr;
+  int i;
+  int rc = LSM_OK;
+
+  for(i=0; rc==LSM_OK && i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId ){
+      if( (iLsmId!=0 && p->iLsmId!=0 && iLsmId>=p->iLsmId)
+       || (iLsmId==0 && shm_sequence_ge(p->iTreeId, iShmid))
+      ){
+        rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+        if( rc==LSM_OK ){
+          p->iLsmId = 0;
+          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+        }
+      }
+    }
+  }
+
+  /* The loop stops at the first slot that could not be locked. */
+  if( rc==LSM_BUSY ){
+    *pbInUse = 1;
+    return LSM_OK;
+  }
+  *pbInUse = 0;
+  return rc;
+}
+
+/*
+** This function is called by worker connections to determine the smallest
+** snapshot id that is currently in use by a database client. The worker
+** connection uses this result to determine whether or not it is safe to
+** recycle a database block.
+*/
+static int firstSnapshotInUse(
+  lsm_db *db,                     /* Database handle */
+  i64 *piInUse                    /* IN/OUT: Smallest snapshot id in use */
+){
+  ShmHeader *pShm = db->pShmhdr;
+  i64 iInUse = *piInUse;
+  int i;
+
+  assert( iInUse>0 );
+  for(i=0; i<LSM_LOCK_NREADER; i++){
+    ShmReader *p = &pShm->aReader[i];
+    if( p->iLsmId ){
+      i64 iThis = p->iLsmId;
+      if( iThis!=0 && iInUse>iThis ){
+        /* This slot names an older snapshot than the current minimum. If
+        ** an EXCLUSIVE lock can be taken nobody is reading from it, so
+        ** the slot is cleared. If the lock is busy, a reader really is
+        ** using snapshot iThis and it becomes the new minimum. */
+        int rc = lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_EXCL, 0);
+        if( rc==LSM_OK ){
+          p->iLsmId = 0;
+          lsmShmLock(db, LSM_LOCK_READER(i), LSM_LOCK_UNLOCK, 0);
+        }else if( rc==LSM_BUSY ){
+          iInUse = iThis;
+        }else{
+          /* Some error other than LSM_BUSY. Return the error code to
+          ** the caller in this case. *piInUse is left unmodified. */
+          return rc;
+        }
+      }
+    }
+  }
+
+  *piInUse = iInUse;
+  return LSM_OK;
+}
+
+/*
+** Set *pbInUse to true if the in-memory tree with shared-memory sequence
+** id iShmid is in use, or to false otherwise. The tree is always reported
+** as in use if iShmid is the id in this connection's own tree header
+** (db->treehdr.iUsedShmid); otherwise the read-lock slots are consulted
+** via isInUse().
+**
+** Returns LSM_OK, or an LSM error code passed up from isInUse().
+*/
+int lsmTreeInUse(lsm_db *db, u32 iShmid, int *pbInUse){
+ if( db->treehdr.iUsedShmid==iShmid ){
+ *pbInUse = 1;
+ return LSM_OK;
+ }
+ return isInUse(db, 0, iShmid, pbInUse);
+}
+
+/*
+** Set *pbInUse to true if snapshot iLsmId is in use, or to false
+** otherwise. The snapshot is always reported as in use if this connection
+** has a client snapshot (db->pClient) whose id is less than or equal to
+** iLsmId; otherwise the read-lock slots are consulted via isInUse().
+**
+** Returns LSM_OK, or an LSM error code passed up from isInUse().
+*/
+int lsmLsmInUse(lsm_db *db, i64 iLsmId, int *pbInUse){
+ if( db->pClient && db->pClient->iId<=iLsmId ){
+ *pbInUse = 1;
+ return LSM_OK;
+ }
+ return isInUse(db, iLsmId, 0, pbInUse);
+}
+
+/*
+** This function may only be called after a successful call to
+** lsmDbDatabaseConnect(). It returns true if the connection is in
+** multi-process mode, or false otherwise.
+*/
+int lsmDbMultiProc(lsm_db *pDb){
+  /* A connection with no Database object is never multi-process. */
+  Database *p = pDb->pDatabase;
+  return p && p->bMultiProc;
+}
+
+
+/*************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+**************************************************************************
+*************************************************************************/
+
+/*
+** Ensure that database connection db has cached pointers to at least the
+** first nChunk chunks of shared memory.
+*/
+int lsmShmCacheChunks(lsm_db *db, int nChunk){
+  int rc = LSM_OK;
+  if( nChunk>db->nShm ){
+    static const int NINCR = 16;
+    Database *p = db->pDatabase;
+    lsm_env *pEnv = db->pEnv;
+    int nAlloc;
+    int i;
+
+    /* Ensure that the db->apShm[] array is large enough. If an attempt to
+    ** allocate memory fails, return LSM_NOMEM immediately. The apShm[] array
+    ** is always extended in multiples of 16 entries - so the actual allocated
+    ** size can be inferred from nShm. */
+    nAlloc = ((db->nShm + NINCR - 1) / NINCR) * NINCR;
+    while( nChunk>=nAlloc ){
+      void **apShm;
+      nAlloc += NINCR;
+      apShm = lsmRealloc(pEnv, db->apShm, sizeof(void*)*nAlloc);
+      if( !apShm ) return LSM_NOMEM_BKPT;
+      db->apShm = apShm;
+    }
+
+    if( db->bRoTrans ){
+      /* Read-only transaction: the chunks are private, zeroed heap
+      ** allocations owned by this connection. They are freed by
+      ** lsmFinishReadTrans(). */
+      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
+        db->apShm[i] = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+        db->nShm++;
+      }
+
+    }else{
+
+      /* Enter the client mutex */
+      lsmMutexEnter(pEnv, p->pClientMutex);
+
+      /* Extend the Database objects apShmChunk[] array if necessary. Using the
+      ** same pattern as for the lsm_db.apShm[] array above. */
+      nAlloc = ((p->nShmChunk + NINCR - 1) / NINCR) * NINCR;
+      while( nChunk>=nAlloc ){
+        void **apShm;
+        nAlloc += NINCR;
+        apShm = lsmRealloc(pEnv, p->apShmChunk, sizeof(void*)*nAlloc);
+        if( !apShm ){
+          rc = LSM_NOMEM_BKPT;
+          break;
+        }
+        p->apShmChunk = apShm;
+      }
+
+      /* For each chunk this connection is missing: if the Database object
+      ** does not have it yet either, allocate it (single-process mode) or
+      ** map it (multi-process mode). Then copy the pointer into the
+      ** connection's own cache. */
+      for(i=db->nShm; rc==LSM_OK && i<nChunk; i++){
+        if( i>=p->nShmChunk ){
+          void *pChunk = 0;
+          if( p->bMultiProc==0 ){
+            /* Single process mode */
+            pChunk = lsmMallocZeroRc(pEnv, LSM_SHM_CHUNK_SIZE, &rc);
+          }else{
+            /* Multi-process mode */
+            rc = lsmEnvShmMap(pEnv, p->pFile, i, LSM_SHM_CHUNK_SIZE, &pChunk);
+          }
+          if( rc==LSM_OK ){
+            p->apShmChunk[i] = pChunk;
+            p->nShmChunk++;
+          }
+        }
+        if( rc==LSM_OK ){
+          db->apShm[i] = p->apShmChunk[i];
+          db->nShm++;
+        }
+      }
+
+      /* Release the client mutex */
+      lsmMutexLeave(pEnv, p->pClientMutex);
+    }
+  }
+
+  return rc;
+}
+
+/*
+** Apply lock operation eOp to lock iLock on the file p->pFile by calling
+** lsmEnvLock(), but only if the database is in multi-process mode. In
+** single-process mode no file lock is taken and LSM_OK is returned.
+*/
+static int lockSharedFile(lsm_env *pEnv, Database *p, int iLock, int eOp){
+  if( p->bMultiProc==0 ) return LSM_OK;
+  return lsmEnvLock(pEnv, p->pFile, iLock, eOp);
+}
+
+/*
+** Test if it would be possible for connection db to obtain a lock of type
+** eOp on the nLock locks starting at iLock. If so, return LSM_OK. If it
+** would not be possible to obtain the lock due to a lock held by another
+** connection, return LSM_BUSY. If an IO or other error occurs (i.e. in the
+** lsm_env.xTestLock function), return some other LSM error code.
+**
+** Locks held by other connections in this process are checked first,
+** using their mLock masks. Only if none of those conflict, and only in
+** multi-process mode, is the file itself queried.
+**
+** Note that this function never actually locks the database - it merely
+** queries the system to see if there exists a lock that would prevent
+** it from doing so.
+*/
+int lsmShmTestLock(
+  lsm_db *db,
+  int iLock,
+  int nLock,
+  int eOp
+){
+  int rc = LSM_OK;
+  lsm_db *pIter;
+  Database *p = db->pDatabase;
+  int i;
+  u64 mask = 0;
+
+  /* Build the mask of lsm_db.mLock bits that would conflict with the
+  ** requested lock, for EVERY lock in the range iLock..iLock+nLock-1.
+  ** For lock i, bit (i-1) is set by a connection holding it EXCLUSIVE
+  ** and bit (i+32-1) by a connection holding it SHARED or EXCLUSIVE (see
+  ** lsmShmLock()). Any EXCLUSIVE holder blocks the request; SHARED
+  ** holders block it only if an EXCLUSIVE lock is being tested for. */
+  for(i=iLock; i<(iLock+nLock); i++){
+    mask |= ((u64)1 << (i-1));
+    if( eOp==LSM_LOCK_EXCL ) mask |= ((u64)1 << (i+32-1));
+  }
+
+  lsmMutexEnter(db->pEnv, p->pClientMutex);
+  for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+    if( pIter!=db && (pIter->mLock & mask) ){
+      assert( pIter!=db );
+      break;
+    }
+  }
+
+  if( pIter ){
+    rc = LSM_BUSY;
+  }else if( p->bMultiProc ){
+    rc = lsmEnvTestLock(db->pEnv, p->pFile, iLock, nLock, eOp);
+  }
+
+  lsmMutexLeave(db->pEnv, p->pClientMutex);
+  return rc;
+}
+
+/*
+** Attempt to obtain the lock identified by the iLock and eOp parameters.
+** If successful, return LSM_OK. If the lock cannot be obtained because
+** there exists some other conflicting lock, return LSM_BUSY. If some other
+** error occurs, return an LSM error code.
+**
+** Parameter iLock must be one of LSM_LOCK_WRITER, WORKER or CHECKPOINTER,
+** or else a value returned by the LSM_LOCK_READER macro.
+**
+** The locks held by each connection are recorded in lsm_db.mLock. For
+** lock iLock, bit (iLock-1) is set if the connection holds it EXCLUSIVE
+** and bit (iLock+32-1) is set if it holds it SHARED or EXCLUSIVE. The
+** file lock itself is only touched (via lockSharedFile()) when no other
+** connection in this process already holds a compatible lock.
+**
+** NOTE(review): parameter bBlock is not referenced anywhere in the body
+** of this function.
+*/
+int lsmShmLock(
+ lsm_db *db,
+ int iLock,
+ int eOp, /* One of LSM_LOCK_UNLOCK, SHARED or EXCL */
+ int bBlock /* True for a blocking lock */
+){
+ lsm_db *pIter;
+ const u64 me = ((u64)1 << (iLock-1));
+ const u64 ms = ((u64)1 << (iLock+32-1));
+ int rc = LSM_OK;
+ Database *p = db->pDatabase;
+
+ assert( eOp!=LSM_LOCK_EXCL || p->bReadonly==0 );
+ assert( iLock>=1 && iLock<=LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1) );
+ assert( LSM_LOCK_RWCLIENT(LSM_LOCK_NRWCLIENT-1)<=32 );
+ assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+ /* Check for a no-op. Proceed only if this is not one of those. The
+ ** request is a no-op if it is an UNLOCK and nothing is held, a SHARED
+ ** request and exactly SHARED is held, or an EXCL request and EXCL is
+ ** already held. */
+ if( (eOp==LSM_LOCK_UNLOCK && (db->mLock & (me|ms))!=0)
+ || (eOp==LSM_LOCK_SHARED && (db->mLock & (me|ms))!=ms)
+ || (eOp==LSM_LOCK_EXCL && (db->mLock & me)==0)
+ ){
+ int nExcl = 0; /* Number of connections holding EXCLUSIVE */
+ int nShared = 0; /* Number of connections holding SHARED */
+ lsmMutexEnter(db->pEnv, p->pClientMutex);
+
+ /* Figure out the locks currently held by this process on iLock, not
+ ** including any held by connection db. */
+ for(pIter=p->pConn; pIter; pIter=pIter->pNext){
+ assert( (pIter->mLock & me)==0 || (pIter->mLock & ms)!=0 );
+ if( pIter!=db ){
+ if( pIter->mLock & me ){
+ nExcl++;
+ }else if( pIter->mLock & ms ){
+ nShared++;
+ }
+ }
+ }
+ assert( nExcl==0 || nExcl==1 );
+ assert( nExcl==0 || nShared==0 );
+ assert( nExcl==0 || (db->mLock & (me|ms))==0 );
+
+ switch( eOp ){
+ case LSM_LOCK_UNLOCK:
+ /* Only drop the file lock if no other connection in this process
+ ** still holds it SHARED. The result of the unlock is not checked. */
+ if( nShared==0 ){
+ lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_UNLOCK);
+ }
+ db->mLock &= ~(me|ms);
+ break;
+
+ case LSM_LOCK_SHARED:
+ if( nExcl ){
+ rc = LSM_BUSY;
+ }else{
+ /* If another connection already holds SHARED, the file lock is
+ ** already in place and only the bookkeeping needs updating. */
+ if( nShared==0 ){
+ rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_SHARED);
+ }
+ if( rc==LSM_OK ){
+ db->mLock |= ms;
+ db->mLock &= ~me;
+ }
+ }
+ break;
+
+ default:
+ assert( eOp==LSM_LOCK_EXCL );
+ if( nExcl || nShared ){
+ rc = LSM_BUSY;
+ }else{
+ rc = lockSharedFile(db->pEnv, p, iLock, LSM_LOCK_EXCL);
+ if( rc==LSM_OK ){
+ db->mLock |= (me|ms);
+ }
+ }
+ break;
+ }
+
+ lsmMutexLeave(db->pEnv, p->pClientMutex);
+ }
+
+ return rc;
+}
+
+#ifdef LSM_DEBUG
+
+/*
+** Return the type of lock that connection db currently holds on lock
+** iLock according to its db->mLock mask: LSM_LOCK_EXCL if the exclusive
+** bit (iLock-1) is set, otherwise LSM_LOCK_SHARED if the shared bit
+** (iLock+32-1) is set, otherwise LSM_LOCK_UNLOCK.
+**
+** iLock must be at least 1, as the masks are built by shifting by
+** (iLock-1).
+*/
+int shmLockType(lsm_db *db, int iLock){
+ const u64 me = ((u64)1 << (iLock-1));
+ const u64 ms = ((u64)1 << (iLock+32-1));
+
+ if( db->mLock & me ) return LSM_LOCK_EXCL;
+ if( db->mLock & ms ) return LSM_LOCK_SHARED;
+ return LSM_LOCK_UNLOCK;
+}
+
+/*
+** The arguments passed to this function are similar to those passed to
+** the lsmShmLock() function. However, instead of obtaining a new lock
+** this function returns true if the specified connection already holds
+** (or does not hold) such a lock, depending on the value of eOp. As
+** follows:
+**
+** (eOp==LSM_LOCK_UNLOCK) -> true if db has no lock on iLock
+** (eOp==LSM_LOCK_SHARED) -> true if db has at least a SHARED lock on iLock.
+** (eOp==LSM_LOCK_EXCL) -> true if db has an EXCLUSIVE lock on iLock.
+*/
+int lsmShmAssertLock(lsm_db *db, int iLock, int eOp){
+  int eHave;                      /* Lock type currently held on iLock */
+
+  assert( iLock>=1 && iLock<=LSM_LOCK_READER(LSM_LOCK_NREADER-1) );
+  assert( iLock<=16 );
+  assert( eOp==LSM_LOCK_UNLOCK || eOp==LSM_LOCK_SHARED || eOp==LSM_LOCK_EXCL );
+
+  eHave = shmLockType(db, iLock);
+
+  /* Nothing may be held. */
+  if( eOp==LSM_LOCK_UNLOCK ) return (eHave==LSM_LOCK_UNLOCK);
+
+  /* Anything at all (SHARED or EXCLUSIVE) satisfies a SHARED query. */
+  if( eOp==LSM_LOCK_SHARED ) return (eHave!=LSM_LOCK_UNLOCK);
+
+  /* Only EXCLUSIVE satisfies an EXCLUSIVE query. */
+  if( eOp==LSM_LOCK_EXCL ) return (eHave==LSM_LOCK_EXCL);
+
+  assert( !"bad eOp value passed to lsmShmAssertLock()" );
+  return 0;
+}
+
+/*
+** Return true if connection db holds an EXCLUSIVE lock on WORKER and also
+** has a worker snapshot loaded (db->pWorker is non-NULL).
+*/
+int lsmShmAssertWorker(lsm_db *db){
+ return lsmShmAssertLock(db, LSM_LOCK_WORKER, LSM_LOCK_EXCL) && db->pWorker;
+}
+
+/*
+** This function does not contribute to library functionality, and is not
+** included in release builds. It is intended to be called from within
+** an interactive debugger.
+**
+** When called, this function prints a single line of human readable output
+** to stdout describing the locks currently held by the connection. For
+** example:
+**
+** (gdb) call print_db_locks(pDb)
+** (shared on dms2) (exclusive on writer)
+*/
+void print_db_locks(lsm_db *db){
+  /* Indexed by the value returned by shmLockType(); this assumes, as the
+  ** original table did, that LSM_LOCK_UNLOCK/SHARED/EXCL are 0/1/2. */
+  static const char *azLock[] = {0, "shared", "exclusive"};
+
+  /* Indexed by lock id. Entry 0 is unused because lock ids start at 1.
+  ** NOTE(review): this file also uses LSM_LOCK_DMS3 and LSM_LOCK_ROTRANS,
+  ** which have no entry here - confirm the names still line up with the
+  ** LSM_LOCK_* definitions. */
+  static const char *azName[] = {
+    0, "dms1", "dms2", "writer", "worker", "checkpointer",
+    "reader0", "reader1", "reader2", "reader3", "reader4", "reader5"
+  };
+  const int nName = (int)(sizeof(azName)/sizeof(azName[0]));
+
+  int bOne = 0;                   /* True once something has been printed */
+  int iLock;
+
+  /* Start at 1: shmLockType() shifts by (iLock-1), so 0 is not a valid
+  ** lock id. Lock ids beyond the end of azName[] are printed by number
+  ** instead of reading past the end of the array. */
+  for(iLock=1; iLock<16; iLock++){
+    int eHave = shmLockType(db, iLock);
+    if( azLock[eHave] ){
+      if( iLock<nName ){
+        printf("%s(%s on %s)", (bOne?" ":""), azLock[eHave], azName[iLock]);
+      }else{
+        printf("%s(%s on lock %d)", (bOne?" ":""), azLock[eHave], iLock);
+      }
+      bOne = 1;
+    }
+  }
+  printf("\n");
+}
+/*
+** Debugging aid, like print_db_locks(). Prints one line to stdout for each
+** connection in db->pDatabase->pConn, consisting of the connection's
+** address followed by the output of print_db_locks() for it. The line for
+** connection db itself is prefixed with "*".
+*/
+void print_all_db_locks(lsm_db *db){
+  lsm_db *p;
+  for(p=db->pDatabase->pConn; p; p=p->pNext){
+    /* %p requires a (void *) argument. */
+    printf("%s connection %p ", ((p==db)?"*":""), (void *)p);
+    print_db_locks(p);
+  }
+}
+#endif
+
+/*
+** Invoke the shared-memory barrier of the environment used by connection
+** db, by calling lsmEnvShmBarrier(db->pEnv).
+*/
+void lsmShmBarrier(lsm_db *db){
+ lsmEnvShmBarrier(db->pEnv);
+}
+
+/*
+** Attempt to checkpoint the database by calling lsmCheckpointWrite().
+**
+** If pnKB is not NULL, *pnKB is set to the amount of data checkpointed,
+** in KB rounded up, computed as the number of pages written multiplied by
+** the page size. It is set to zero if an error occurred or if no pages
+** were written.
+**
+** Returns the result code of lsmCheckpointWrite().
+*/
+int lsm_checkpoint(lsm_db *pDb, int *pnKB){
+ int rc; /* Return code */
+ u32 nWrite = 0; /* Number of pages checkpointed */
+
+ /* Attempt the checkpoint. If successful, nWrite is set to the number of
+ ** pages written between this and the previous checkpoint. */
+ rc = lsmCheckpointWrite(pDb, &nWrite);
+
+ /* If required, calculate the output variable (KB of data checkpointed).
+ ** Set it to zero if an error occurred. */
+ if( pnKB ){
+ int nKB = 0;
+ if( rc==LSM_OK && nWrite ){
+ /* Multiply as i64, then round up to a whole number of KB. */
+ nKB = (((i64)nWrite * lsmFsPageSize(pDb->pFS)) + 1023) / 1024;
+ }
+ *pnKB = nKB;
+ }
+
+ return rc;
+}
diff --git a/ext/lsm1/lsm_sorted.c b/ext/lsm1/lsm_sorted.c
new file mode 100644
index 0000000..f479f4c
--- /dev/null
+++ b/ext/lsm1/lsm_sorted.c
@@ -0,0 +1,6170 @@
+/*
+** 2011-08-14
+**
+** The author disclaims copyright to this source code. In place of
+** a legal notice, here is a blessing:
+**
+** May you do good and not evil.
+** May you find forgiveness for yourself and forgive others.
+** May you share freely, never taking more than you give.
+**
+*************************************************************************
+**
+** PAGE FORMAT:
+**
+** The maximum page size is 65536 bytes.
+**
+** Since all records are equal to or larger than 2 bytes in size, and
+** some space within the page is consumed by the page footer, there must
+** be less than 2^15 records on each page.
+**
+** Each page ends with a footer that describes the page's contents. This
+** footer serves a similar purpose to the page header in an SQLite database.
+** A footer is used instead of a header because it makes it easier to
+** populate a new page based on a sorted list of key/value pairs.
+**
+** The footer consists of the following values (starting at the end of
+** the page and continuing backwards towards the start). All values are
+** stored as unsigned big-endian integers.
+**
+** * Number of records on page (2 bytes).
+** * Flags field (2 bytes).
+** * Left-hand pointer value (8 bytes).
+** * The starting offset of each record (2 bytes per record).
+**
+** Records may span pages. Unless it happens to be an exact fit, the part
+** of the final record that starts on page X that does not fit on page X
+** is stored at the start of page (X+1). This means there may be pages where
+** (N==0). And on most pages the first record that starts on the page will
+** not start at byte offset 0. For example:
+**
+** aaaaa bbbbb ccc