diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..e86f2562 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +bcache-super-show +make-bcache +probe-bcache +.* +*.o diff --git a/61-bcache.rules b/61-bcache.rules new file mode 100644 index 00000000..dd85e69a --- /dev/null +++ b/61-bcache.rules @@ -0,0 +1,25 @@ +# register bcache devices as they come up +# man 7 udev for syntax + +SUBSYSTEM!="block", GOTO="bcache_end" +ACTION=="remove", GOTO="bcache_end" + +# Backing devices: scan, symlink, register +IMPORT{program}="/sbin/blkid -o udev $tempnode" +# blkid and probe-bcache can disagree, in which case don't register +ENV{ID_FS_TYPE}=="?*", ENV{ID_FS_TYPE}!="bcache", GOTO="bcache_backing_end" + +IMPORT{program}="/sbin/probe-bcache -o udev $tempnode" +ENV{ID_FS_UUID_ENC}=="?*", SYMLINK+="disk/by-uuid/$env{ID_FS_UUID_ENC}" +SUBSYSTEM=="block", ACTION=="add|change", ENV{ID_FS_TYPE}=="bcache", \ + RUN+="bcache-register $tempnode" +LABEL="bcache_backing_end" + +# Cached devices: symlink +DRIVER=="bcache", ENV{CACHED_UUID}=="?*", \ + SYMLINK+="bcache/by-uuid/$env{CACHED_UUID}" +DRIVER=="bcache", ENV{CACHED_LABEL}=="?*", \ + SYMLINK+="bcache/by-label/$env{CACHED_LABEL}" + +LABEL="bcache_end" + diff --git a/COPYING b/COPYING new file mode 100644 index 00000000..3912109b --- /dev/null +++ b/COPYING @@ -0,0 +1,340 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc. + 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. 
This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. (Some other Free Software Foundation software is covered by +the GNU Library General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. 
We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. 
+ +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. 
But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) 
+ +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. 
Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. 
Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. 
Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. 
It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program; if not, write to the Free Software + Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names: + + Yoyodyne, Inc., hereby disclaims all copyright interest in the program + `Gnomovision' (which makes passes at compilers) written by James Hacker. + + <signature of Ty Coon>, 1 April 1989 + Ty Coon, President of Vice + +This General Public License does not permit incorporating your program into +proprietary programs. If your program is a subroutine library, you may +consider it more useful to permit linking proprietary applications with the +library. If this is what you want to do, use the GNU Library General +Public License instead of this License. diff --git a/Makefile b/Makefile new file mode 100644 index 00000000..10ec79c6 --- /dev/null +++ b/Makefile @@ -0,0 +1,24 @@ + +PREFIX=/usr +CFLAGS+=-O2 -Wall -g + +all: make-bcache probe-bcache bcache-super-show + +install: make-bcache probe-bcache bcache-super-show + install -m0755 make-bcache bcache-super-show $(DESTDIR)${PREFIX}/sbin/ + install -m0755 probe-bcache $(DESTDIR)/sbin/ + install -m0644 61-bcache.rules $(DESTDIR)/lib/udev/rules.d/ + install -m0755 bcache-register $(DESTDIR)/lib/udev/ + -install -m0755 initramfs/hook $(DESTDIR)/etc/initramfs-tools/hooks/bcache + install -m0644 -- *.8 $(DESTDIR)${PREFIX}/share/man/man8 +# install -m0755 bcache-test $(DESTDIR)${PREFIX}/sbin/ + +clean: + $(RM) -f make-bcache probe-bcache bcache-super-show bcache-test *.o + +bcache-test: LDLIBS += -lm -lssl -lcrypto +make-bcache: LDLIBS += -luuid +make-bcache: bcache.o +probe-bcache: LDLIBS += -luuid +bcache-super-show: LDLIBS += -luuid +bcache-super-show: bcache.o diff --git a/README b/README index 3e4ac29b..4a13db68 100644 --- a/README +++ b/README @@ -1 +1,27 @@ -Please add some description of what this package does \ No newline at end of file +These are the userspace tools required for bcache. + +Bcache is a patch for the Linux kernel to use SSDs to cache other block +devices. For more information, see http://bcache.evilpiepirate.org. 
+Documentation for the run time interface is included in the kernel tree, in +Documentation/bcache.txt. + +Included tools: + +make-bcache +Formats a block device for use with bcache. A device can be formatted for use +as a cache or as a backing device (requires yet to be implemented kernel +support). The most important option is for specifying the bucket size. +Allocation is done in terms of buckets, and cache hits are counted per bucket; +thus a smaller bucket size will give better cache utilization, but poorer write +performance. The bucket size is intended to be equal to the size of your SSD's +erase blocks, which seems to be 128k-512k for most SSDs; feel free to +experiment. + +probe-bcache +Only necessary until support for the bcache superblock is included +in blkid; in the meantime, provides just enough functionality for a udev script +to create the /dev/disk/by-uuid symlink. The arguments it does support are the +same as for blkid. + +bcache-super-show +Prints the bcache superblock of a cache device or a backing device. 
diff --git a/bcache-register b/bcache-register new file mode 100755 index 00000000..bf93c7f9 --- /dev/null +++ b/bcache-register @@ -0,0 +1,4 @@ +#!/bin/sh +modprobe -qba bcache +test -f /sys/fs/bcache/register && echo "$1" > /sys/fs/bcache/register + diff --git a/bcache-super-show.8 b/bcache-super-show.8 new file mode 100644 index 00000000..7d15a933 --- /dev/null +++ b/bcache-super-show.8 @@ -0,0 +1,11 @@ +.TH bcache-super-show 8 +.SH NAME +bcache-super-show \- Print the bcache superblock +.SH SYNOPSIS +.B bcache-super-show +[\fB \-f] +.I device +.SH OPTIONS +.TP +.BR \-f +Keep going if the superblock crc is invalid diff --git a/bcache-super-show.c b/bcache-super-show.c new file mode 100644 index 00000000..fab5e810 --- /dev/null +++ b/bcache-super-show.c @@ -0,0 +1,205 @@ +/* + * Author: Gabriel de Perthuis + * + * GPLv2 + */ + +#define _FILE_OFFSET_BITS 64 +#define __USE_FILE_OFFSET64 +#define _XOPEN_SOURCE 500 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "bcache.h" + + +static void usage() +{ + fprintf(stderr, "Usage: bcache-super-show [-f] \n"); +} + + +int main(int argc, char **argv) +{ + bool force_csum = false; + int o; + extern char *optarg; + struct cache_sb sb; + char uuid[40]; + uint64_t expected_csum; + + while ((o = getopt(argc, argv, "f")) != EOF) + switch (o) { + case 'f': + force_csum = 1; + break; + + default: + usage(); + exit(1); + } + + argv += optind; + argc -= optind; + + if (argc != 1) { + usage(); + exit(1); + } + + int fd = open(argv[0], O_RDONLY); + if (fd < 0) { + printf("Can't open dev %s: %s\n", argv[0], strerror(errno)); + exit(2); + } + + if (pread(fd, &sb, sizeof(sb), SB_START) != sizeof(sb)) { + fprintf(stderr, "Couldn't read\n"); + exit(2); + } + + printf("sb.magic\t\t"); + if (!memcmp(sb.magic, bcache_magic, 16)) { + printf("ok\n"); + } else { + printf("bad magic\n"); + fprintf(stderr, "Invalid superblock (bad magic)\n"); + 
exit(2); + } + + printf("sb.first_sector\t\t%" PRIu64, sb.offset); + if (sb.offset == SB_SECTOR) { + printf(" [match]\n"); + } else { + printf(" [expected %ds]\n", SB_SECTOR); + fprintf(stderr, "Invalid superblock (bad sector)\n"); + exit(2); + } + + printf("sb.csum\t\t\t%" PRIX64, sb.csum); + expected_csum = csum_set(&sb); + if (sb.csum == expected_csum) { + printf(" [match]\n"); + } else { + printf(" [expected %" PRIX64 "]\n", expected_csum); + if (!force_csum) { + fprintf(stderr, "Corrupt superblock (bad csum)\n"); + exit(2); + } + } + + printf("sb.version\t\t%" PRIu64, sb.version); + switch (sb.version) { + // These are handled the same by the kernel + case BCACHE_SB_VERSION_CDEV: + case BCACHE_SB_VERSION_CDEV_WITH_UUID: + printf(" [cache device]\n"); + break; + + // The second adds data offset support + case BCACHE_SB_VERSION_BDEV: + case BCACHE_SB_VERSION_BDEV_WITH_OFFSET: + printf(" [backing device]\n"); + break; + + default: + printf(" [unknown]\n"); + // exit code? + return 0; + } + + putchar('\n'); + + uuid_unparse(sb.uuid, uuid); + printf("dev.uuid\t\t%s\n", uuid); + + printf("dev.sectors_per_block\t%u\n" + "dev.sectors_per_bucket\t%u\n", + sb.block_size, + sb.bucket_size); + + if (!SB_IS_BDEV(&sb)) { + // total_sectors includes the superblock; + printf("dev.cache.first_sector\t%u\n" + "dev.cache.cache_sectors\t%ju\n" + "dev.cache.total_sectors\t%ju\n" + "dev.cache.discard\t%s\n" + "dev.cache.pos\t\t%u\n", + sb.bucket_size * sb.first_bucket, + sb.bucket_size * (sb.nbuckets - sb.first_bucket), + sb.bucket_size * sb.nbuckets, + CACHE_DISCARD(&sb) ? 
"yes" : "no", + sb.nr_this_dev); + } else { + uint64_t first_sector; + if (sb.version == BCACHE_SB_VERSION_BDEV) { + first_sector = BDEV_DATA_START_DEFAULT; + } else { + if (sb.keys == 1 || sb.d[0]) { + fprintf(stderr, + "Possible experimental format detected, bailing\n"); + exit(3); + } + first_sector = sb.data_offset; + } + + printf("dev.data.first_sector\t%ju\n" + "dev.data.cache_mode\t%ju", + first_sector, + BDEV_CACHE_MODE(&sb)); + switch (BDEV_CACHE_MODE(&sb)) { + case CACHE_MODE_WRITETHROUGH: + printf(" [writethrough]\n"); + break; + case CACHE_MODE_WRITEBACK: + printf(" [writeback]\n"); + break; + case CACHE_MODE_WRITEAROUND: + printf(" [writearound]\n"); + break; + case CACHE_MODE_NONE: + printf(" [no caching]\n"); + break; + default: + putchar('\n'); + } + + printf("dev.data.cache_state\t%ju", + BDEV_STATE(&sb)); + switch (BDEV_STATE(&sb)) { + case BDEV_STATE_NONE: + printf(" [detached]\n"); + break; + case BDEV_STATE_CLEAN: + printf(" [clean]\n"); + break; + case BDEV_STATE_DIRTY: + printf(" [dirty]\n"); + break; + case BDEV_STATE_STALE: + printf(" [inconsistent]\n"); + break; + default: + putchar('\n'); + } + } + putchar('\n'); + + uuid_unparse(sb.set_uuid, uuid); + printf("cset.uuid\t\t%s\n", uuid); + + return 0; +} diff --git a/bcache-test.c b/bcache-test.c new file mode 100644 index 00000000..cc164e0e --- /dev/null +++ b/bcache-test.c @@ -0,0 +1,320 @@ +/* + * Author: Kent Overstreet + * + * GPLv2 + */ + +#define _FILE_OFFSET_BITS 64 +#define _XOPEN_SOURCE 500 +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static const unsigned char bcache_magic[] = { + 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 }; + +unsigned char zero[4096]; + +bool klog = false; + +#define Pread(fd, buf, size, offset) do { \ + int _read = 0, _r; \ + while (_read < size) 
{ \ + _r = pread(fd, buf, (size) - _read, (offset) + _read); \ + if (_r <= 0) \ + goto err; \ + _read += _r; \ + } \ +} while (0) + +#define Pwrite(fd, buf, size, offset) do { \ + int _write = 0, _r; \ + while (_write < size) { \ + _r = pwrite(fd, buf, (size) - _write, offset + _write); \ + if (_r < 0) \ + goto err; \ + _write += _r; \ + } \ +} while (0) + +/* Marsaglia polar method + */ +double normal() +{ + double x, y, s; + static double n = 0 / (double) 0; + + if (n == n) { + x = n; + n = 0 / (double) 0; + return x; + } + + do { + x = random() / (double) (RAND_MAX / 2) - 1; + y = random() / (double) (RAND_MAX / 2) - 1; + + s = x * x + y * y; + } while (s >= 1); + + s = sqrt(-2 * log(s) / s); + n = y * s; + return x * s; +} + +long getblocks(int fd) +{ + long ret; + struct stat statbuf; + if (fstat(fd, &statbuf)) { + perror("stat error"); + exit(EXIT_FAILURE); + } + ret = statbuf.st_size / 512; + if (S_ISBLK(statbuf.st_mode)) + if (ioctl(fd, BLKGETSIZE, &ret)) { + perror("ioctl error"); + exit(EXIT_FAILURE); + } + return ret; +} + +struct pagestuff { + unsigned char csum[16]; + unsigned char oldcsum[16]; + int readcount; + int writecount; +}; + +void flushlog(void) +{ + char logbuf[1 << 21]; + int w = 0, len; + static int fd; + + if (!klog) + return; + + if (!fd) { + klogctl(8, 0, 6); + + sprintf(logbuf, "log.%i", abs(random()) % 1000); + fd = open(logbuf, O_WRONLY|O_CREAT|O_TRUNC, 0644); + + if (fd == -1) { + perror("Error opening log file"); + exit(EXIT_FAILURE); + } + } + + len = klogctl(4, logbuf, 1 << 21); + + if (len == -1) { + perror("Error reading kernel log"); + exit(EXIT_FAILURE); + } + + while (w < len) { + int r = write(fd, logbuf + w, len - w); + if (r == -1) { + perror("Error writing log"); + exit(EXIT_FAILURE); + } + w += r; + } +} + +void aio_loop(int nr) +{ + +} + +void usage() +{ + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + bool walk = false, randsize = false, verbose = false, csum = false, rtest = false, wtest = false; + 
int fd1, fd2 = 0, direct = 0, nbytes = 4096, j, o; + unsigned long size, i, offset = 0, done = 0, unique = 0, benchmark = 0; + void *buf1 = NULL, *buf2 = NULL; + struct pagestuff *pages, *p; + unsigned char c[16]; + time_t last_printed = 0; + extern char *optarg; + + RC4_KEY writedata; + RC4_set_key(&writedata, 16, bcache_magic); + + while ((o = getopt(argc, argv, "dnwvscwlb:")) != EOF) + switch (o) { + case 'd': + direct = O_DIRECT; + break; + case 'n': + walk = true; + break; + case 'v': + verbose = true; + break; + case 's': + randsize = true; + break; + case 'c': + csum = true; + break; + case 'w': + wtest = true; + break; + case 'r': + rtest = true; + break; + case 'l': + klog = true; + break; + case 'b': + benchmark = atol(optarg); + break; + default: + usage(); + } + + argv += optind; + argc -= optind; + + if (!rtest && !wtest) + rtest = true; + + if (argc < 1) { + printf("Please enter a device to test\n"); + exit(EXIT_FAILURE); + } + + if (!csum && !benchmark && argc < 2) { + printf("Please enter a device to compare against\n"); + exit(EXIT_FAILURE); + } + + fd1 = open(argv[0], (wtest ? O_RDWR : O_RDONLY)|direct); + if (!csum && !benchmark) + fd2 = open(argv[1], (wtest ? O_RDWR : O_RDONLY)|direct); + + if (fd1 == -1 || fd2 == -1) { + perror("Error opening device"); + exit(EXIT_FAILURE); + } + + size = getblocks(fd1); + if (!csum && !benchmark) + size = MIN(size, getblocks(fd2)); + + size = size / 8 - 16; + pages = calloc(size + 16, sizeof(*pages)); + printf("size %li\n", size); + + if (posix_memalign(&buf1, 4096, 4096 * 16) || + posix_memalign(&buf2, 4096, 4096 * 16)) { + printf("Could not allocate buffers\n"); + exit(EXIT_FAILURE); + } + //setvbuf(stdout, NULL, _IONBF, 0); + + for (i = 0; !benchmark || i < benchmark; i++) { + bool writing = (wtest && (i & 1)) || !rtest; + nbytes = randsize ? drand48() * 16 + 1 : 1; + nbytes <<= 12; + + offset >>= 12; + offset += walk ? 
normal() * 20 : random(); + offset %= size; + offset <<= 12; + + if (!(i % 200)) + flushlog(); + + if (!verbose) { + time_t now = time(NULL); + if (now - last_printed >= 2) { + last_printed = now; + goto print; + } + } else +print: printf("Loop %6li offset %9li sectors %3i, %6lu mb done, %6lu mb unique\n", + i, offset >> 9, nbytes >> 9, done >> 11, unique >> 11); + + done += nbytes >> 9; + + if (!writing) + Pread(fd1, buf1, nbytes, offset); + if (!writing && !csum && !benchmark) + Pread(fd2, buf2, nbytes, offset); + + for (j = 0; j < nbytes; j += 4096) { + p = &pages[(offset + j) / 4096]; + + if (writing) + RC4(&writedata, 4096, zero, buf1 + j); + + if (csum) { + MD4(buf1 + j, 4096, &c[0]); + + if (writing || + (!p->readcount && !p->writecount)) { + memcpy(&p->oldcsum[0], &p->csum[0], 16); + memcpy(&p->csum[0], c, 16); + } else if (memcmp(&p->csum[0], c, 16)) + goto bad; + } else if (!writing && !benchmark && + memcmp(buf1 + j, + buf2 + j, + 4096)) + goto bad; + + if (!p->writecount && !p->readcount) + unique += 8; + + writing ? p->writecount++ : p->readcount++; + } + if (writing) + Pwrite(fd1, buf1, nbytes, offset); + if (writing && !csum && !benchmark) + Pwrite(fd2, buf2, nbytes, offset); + } + printf("Loop %6li offset %9li sectors %3i, %6lu mb done, %6lu mb unique\n", + i, offset >> 9, nbytes >> 9, done >> 11, unique >> 11); + exit(EXIT_SUCCESS); +err: + perror("IO error"); + flushlog(); + exit(EXIT_FAILURE); +bad: + printf("Bad read! 
loop %li offset %li readcount %i writecount %i\n", + i, (offset + j) >> 9, p->readcount, p->writecount); + + if (!memcmp(&p->oldcsum[0], c, 16)) + printf("Matches previous csum\n"); + + flushlog(); + exit(EXIT_FAILURE); +} diff --git a/bcache.c b/bcache.c new file mode 100644 index 00000000..8f37445d --- /dev/null +++ b/bcache.c @@ -0,0 +1,129 @@ +#define _GNU_SOURCE + +#include +#include +#include +#include + +/* + * Portions Copyright (c) 1996-2001, PostgreSQL Global Development Group (Any + * use permitted, subject to terms of PostgreSQL license; see.) + + * If we have a 64-bit integer type, then a 64-bit CRC looks just like the + * usual sort of implementation. (See Ross Williams' excellent introduction + * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from + * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.) + * If we have no working 64-bit type, then fake it with two 32-bit registers. + * + * The present implementation is a normal (not "reflected", in Williams' + * terms) 64-bit CRC, using initial all-ones register contents and a final + * bit inversion. 
The chosen polynomial is borrowed from the DLT1 spec + * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM): + * + * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 + + * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 + + * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 + + * x^7 + x^4 + x + 1 +*/ + +static const uint64_t crc_table[256] = { + 0x0000000000000000ULL, 0x42F0E1EBA9EA3693ULL, 0x85E1C3D753D46D26ULL, + 0xC711223CFA3E5BB5ULL, 0x493366450E42ECDFULL, 0x0BC387AEA7A8DA4CULL, + 0xCCD2A5925D9681F9ULL, 0x8E224479F47CB76AULL, 0x9266CC8A1C85D9BEULL, + 0xD0962D61B56FEF2DULL, 0x17870F5D4F51B498ULL, 0x5577EEB6E6BB820BULL, + 0xDB55AACF12C73561ULL, 0x99A54B24BB2D03F2ULL, 0x5EB4691841135847ULL, + 0x1C4488F3E8F96ED4ULL, 0x663D78FF90E185EFULL, 0x24CD9914390BB37CULL, + 0xE3DCBB28C335E8C9ULL, 0xA12C5AC36ADFDE5AULL, 0x2F0E1EBA9EA36930ULL, + 0x6DFEFF5137495FA3ULL, 0xAAEFDD6DCD770416ULL, 0xE81F3C86649D3285ULL, + 0xF45BB4758C645C51ULL, 0xB6AB559E258E6AC2ULL, 0x71BA77A2DFB03177ULL, + 0x334A9649765A07E4ULL, 0xBD68D2308226B08EULL, 0xFF9833DB2BCC861DULL, + 0x388911E7D1F2DDA8ULL, 0x7A79F00C7818EB3BULL, 0xCC7AF1FF21C30BDEULL, + 0x8E8A101488293D4DULL, 0x499B3228721766F8ULL, 0x0B6BD3C3DBFD506BULL, + 0x854997BA2F81E701ULL, 0xC7B97651866BD192ULL, 0x00A8546D7C558A27ULL, + 0x4258B586D5BFBCB4ULL, 0x5E1C3D753D46D260ULL, 0x1CECDC9E94ACE4F3ULL, + 0xDBFDFEA26E92BF46ULL, 0x990D1F49C77889D5ULL, 0x172F5B3033043EBFULL, + 0x55DFBADB9AEE082CULL, 0x92CE98E760D05399ULL, 0xD03E790CC93A650AULL, + 0xAA478900B1228E31ULL, 0xE8B768EB18C8B8A2ULL, 0x2FA64AD7E2F6E317ULL, + 0x6D56AB3C4B1CD584ULL, 0xE374EF45BF6062EEULL, 0xA1840EAE168A547DULL, + 0x66952C92ECB40FC8ULL, 0x2465CD79455E395BULL, 0x3821458AADA7578FULL, + 0x7AD1A461044D611CULL, 0xBDC0865DFE733AA9ULL, 0xFF3067B657990C3AULL, + 0x711223CFA3E5BB50ULL, 0x33E2C2240A0F8DC3ULL, 0xF4F3E018F031D676ULL, + 0xB60301F359DBE0E5ULL, 0xDA050215EA6C212FULL, 0x98F5E3FE438617BCULL, + 
0x5FE4C1C2B9B84C09ULL, 0x1D14202910527A9AULL, 0x93366450E42ECDF0ULL, + 0xD1C685BB4DC4FB63ULL, 0x16D7A787B7FAA0D6ULL, 0x5427466C1E109645ULL, + 0x4863CE9FF6E9F891ULL, 0x0A932F745F03CE02ULL, 0xCD820D48A53D95B7ULL, + 0x8F72ECA30CD7A324ULL, 0x0150A8DAF8AB144EULL, 0x43A04931514122DDULL, + 0x84B16B0DAB7F7968ULL, 0xC6418AE602954FFBULL, 0xBC387AEA7A8DA4C0ULL, + 0xFEC89B01D3679253ULL, 0x39D9B93D2959C9E6ULL, 0x7B2958D680B3FF75ULL, + 0xF50B1CAF74CF481FULL, 0xB7FBFD44DD257E8CULL, 0x70EADF78271B2539ULL, + 0x321A3E938EF113AAULL, 0x2E5EB66066087D7EULL, 0x6CAE578BCFE24BEDULL, + 0xABBF75B735DC1058ULL, 0xE94F945C9C3626CBULL, 0x676DD025684A91A1ULL, + 0x259D31CEC1A0A732ULL, 0xE28C13F23B9EFC87ULL, 0xA07CF2199274CA14ULL, + 0x167FF3EACBAF2AF1ULL, 0x548F120162451C62ULL, 0x939E303D987B47D7ULL, + 0xD16ED1D631917144ULL, 0x5F4C95AFC5EDC62EULL, 0x1DBC74446C07F0BDULL, + 0xDAAD56789639AB08ULL, 0x985DB7933FD39D9BULL, 0x84193F60D72AF34FULL, + 0xC6E9DE8B7EC0C5DCULL, 0x01F8FCB784FE9E69ULL, 0x43081D5C2D14A8FAULL, + 0xCD2A5925D9681F90ULL, 0x8FDAB8CE70822903ULL, 0x48CB9AF28ABC72B6ULL, + 0x0A3B7B1923564425ULL, 0x70428B155B4EAF1EULL, 0x32B26AFEF2A4998DULL, + 0xF5A348C2089AC238ULL, 0xB753A929A170F4ABULL, 0x3971ED50550C43C1ULL, + 0x7B810CBBFCE67552ULL, 0xBC902E8706D82EE7ULL, 0xFE60CF6CAF321874ULL, + 0xE224479F47CB76A0ULL, 0xA0D4A674EE214033ULL, 0x67C58448141F1B86ULL, + 0x253565A3BDF52D15ULL, 0xAB1721DA49899A7FULL, 0xE9E7C031E063ACECULL, + 0x2EF6E20D1A5DF759ULL, 0x6C0603E6B3B7C1CAULL, 0xF6FAE5C07D3274CDULL, + 0xB40A042BD4D8425EULL, 0x731B26172EE619EBULL, 0x31EBC7FC870C2F78ULL, + 0xBFC9838573709812ULL, 0xFD39626EDA9AAE81ULL, 0x3A28405220A4F534ULL, + 0x78D8A1B9894EC3A7ULL, 0x649C294A61B7AD73ULL, 0x266CC8A1C85D9BE0ULL, + 0xE17DEA9D3263C055ULL, 0xA38D0B769B89F6C6ULL, 0x2DAF4F0F6FF541ACULL, + 0x6F5FAEE4C61F773FULL, 0xA84E8CD83C212C8AULL, 0xEABE6D3395CB1A19ULL, + 0x90C79D3FEDD3F122ULL, 0xD2377CD44439C7B1ULL, 0x15265EE8BE079C04ULL, + 0x57D6BF0317EDAA97ULL, 0xD9F4FB7AE3911DFDULL, 0x9B041A914A7B2B6EULL, + 
0x5C1538ADB04570DBULL, 0x1EE5D94619AF4648ULL, 0x02A151B5F156289CULL, + 0x4051B05E58BC1E0FULL, 0x87409262A28245BAULL, 0xC5B073890B687329ULL, + 0x4B9237F0FF14C443ULL, 0x0962D61B56FEF2D0ULL, 0xCE73F427ACC0A965ULL, + 0x8C8315CC052A9FF6ULL, 0x3A80143F5CF17F13ULL, 0x7870F5D4F51B4980ULL, + 0xBF61D7E80F251235ULL, 0xFD913603A6CF24A6ULL, 0x73B3727A52B393CCULL, + 0x31439391FB59A55FULL, 0xF652B1AD0167FEEAULL, 0xB4A25046A88DC879ULL, + 0xA8E6D8B54074A6ADULL, 0xEA16395EE99E903EULL, 0x2D071B6213A0CB8BULL, + 0x6FF7FA89BA4AFD18ULL, 0xE1D5BEF04E364A72ULL, 0xA3255F1BE7DC7CE1ULL, + 0x64347D271DE22754ULL, 0x26C49CCCB40811C7ULL, 0x5CBD6CC0CC10FAFCULL, + 0x1E4D8D2B65FACC6FULL, 0xD95CAF179FC497DAULL, 0x9BAC4EFC362EA149ULL, + 0x158E0A85C2521623ULL, 0x577EEB6E6BB820B0ULL, 0x906FC95291867B05ULL, + 0xD29F28B9386C4D96ULL, 0xCEDBA04AD0952342ULL, 0x8C2B41A1797F15D1ULL, + 0x4B3A639D83414E64ULL, 0x09CA82762AAB78F7ULL, 0x87E8C60FDED7CF9DULL, + 0xC51827E4773DF90EULL, 0x020905D88D03A2BBULL, 0x40F9E43324E99428ULL, + 0x2CFFE7D5975E55E2ULL, 0x6E0F063E3EB46371ULL, 0xA91E2402C48A38C4ULL, + 0xEBEEC5E96D600E57ULL, 0x65CC8190991CB93DULL, 0x273C607B30F68FAEULL, + 0xE02D4247CAC8D41BULL, 0xA2DDA3AC6322E288ULL, 0xBE992B5F8BDB8C5CULL, + 0xFC69CAB42231BACFULL, 0x3B78E888D80FE17AULL, 0x7988096371E5D7E9ULL, + 0xF7AA4D1A85996083ULL, 0xB55AACF12C735610ULL, 0x724B8ECDD64D0DA5ULL, + 0x30BB6F267FA73B36ULL, 0x4AC29F2A07BFD00DULL, 0x08327EC1AE55E69EULL, + 0xCF235CFD546BBD2BULL, 0x8DD3BD16FD818BB8ULL, 0x03F1F96F09FD3CD2ULL, + 0x41011884A0170A41ULL, 0x86103AB85A2951F4ULL, 0xC4E0DB53F3C36767ULL, + 0xD8A453A01B3A09B3ULL, 0x9A54B24BB2D03F20ULL, 0x5D45907748EE6495ULL, + 0x1FB5719CE1045206ULL, 0x919735E51578E56CULL, 0xD367D40EBC92D3FFULL, + 0x1476F63246AC884AULL, 0x568617D9EF46BED9ULL, 0xE085162AB69D5E3CULL, + 0xA275F7C11F7768AFULL, 0x6564D5FDE549331AULL, 0x279434164CA30589ULL, + 0xA9B6706FB8DFB2E3ULL, 0xEB46918411358470ULL, 0x2C57B3B8EB0BDFC5ULL, + 0x6EA7525342E1E956ULL, 0x72E3DAA0AA188782ULL, 0x30133B4B03F2B111ULL, + 
0xF7021977F9CCEAA4ULL, 0xB5F2F89C5026DC37ULL, 0x3BD0BCE5A45A6B5DULL, + 0x79205D0E0DB05DCEULL, 0xBE317F32F78E067BULL, 0xFCC19ED95E6430E8ULL, + 0x86B86ED5267CDBD3ULL, 0xC4488F3E8F96ED40ULL, 0x0359AD0275A8B6F5ULL, + 0x41A94CE9DC428066ULL, 0xCF8B0890283E370CULL, 0x8D7BE97B81D4019FULL, + 0x4A6ACB477BEA5A2AULL, 0x089A2AACD2006CB9ULL, 0x14DEA25F3AF9026DULL, + 0x562E43B4931334FEULL, 0x913F6188692D6F4BULL, 0xD3CF8063C0C759D8ULL, + 0x5DEDC41A34BBEEB2ULL, 0x1F1D25F19D51D821ULL, 0xD80C07CD676F8394ULL, + 0x9AFCE626CE85B507ULL +}; + +inline uint64_t crc64(const void *_data, size_t len) +{ + uint64_t crc = 0xFFFFFFFFFFFFFFFFULL; + const unsigned char *data = _data; + + while (len--) { + int i = ((int) (crc >> 56) ^ *data++) & 0xFF; + crc = crc_table[i] ^ (crc << 8); + } + + return crc ^ 0xFFFFFFFFFFFFFFFFULL; +} diff --git a/bcache.h b/bcache.h new file mode 100644 index 00000000..1d78da3b --- /dev/null +++ b/bcache.h @@ -0,0 +1,126 @@ +/* + * Author: Kent Overstreet + * + * GPLv2 + */ + +#ifndef _BCACHE_H +#define _BCACHE_H + +#define BITMASK(name, type, field, offset, size) \ +static inline uint64_t name(const type *k) \ +{ return (k->field >> offset) & ~(((uint64_t) ~0) << size); } \ + \ +static inline void SET_##name(type *k, uint64_t v) \ +{ \ + k->field &= ~(~((uint64_t) ~0 << size) << offset); \ + k->field |= v << offset; \ +} + +static const char bcache_magic[] = { + 0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca, + 0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81 }; + +/* + * Version 0: Cache device + * Version 1: Backing device + * Version 2: Seed pointer into btree node checksum + * Version 3: Cache device with new UUID format + * Version 4: Backing device with data offset + */ +#define BCACHE_SB_VERSION_CDEV 0 +#define BCACHE_SB_VERSION_BDEV 1 +#define BCACHE_SB_VERSION_CDEV_WITH_UUID 3 +#define BCACHE_SB_VERSION_BDEV_WITH_OFFSET 4 +#define BCACHE_SB_MAX_VERSION 4 + +#define SB_SECTOR 8 +#define SB_LABEL_SIZE 32 +#define SB_JOURNAL_BUCKETS 256U +#define 
BDEV_DATA_START_DEFAULT 16 /* sectors */ +#define SB_START (SB_SECTOR * 512) + +struct cache_sb { + uint64_t csum; + uint64_t offset; /* sector where this sb was written */ + uint64_t version; + + uint8_t magic[16]; + + uint8_t uuid[16]; + union { + uint8_t set_uuid[16]; + uint64_t set_magic; + }; + uint8_t label[SB_LABEL_SIZE]; + + uint64_t flags; + uint64_t seq; + uint64_t pad[8]; + + union { + struct { + /* Cache devices */ + uint64_t nbuckets; /* device size */ + + uint16_t block_size; /* sectors */ + uint16_t bucket_size; /* sectors */ + + uint16_t nr_in_set; + uint16_t nr_this_dev; + }; + struct { + /* Backing devices */ + uint64_t data_offset; + + /* + * block_size from the cache device section is still used by + * backing devices, so don't add anything here until we fix + * things to not need it for backing devices anymore + */ + }; + }; + + uint32_t last_mount; /* time_t */ + + uint16_t first_bucket; + union { + uint16_t njournal_buckets; + uint16_t keys; + }; + uint64_t d[SB_JOURNAL_BUCKETS]; /* journal buckets */ +}; + +static inline bool SB_IS_BDEV(const struct cache_sb *sb) +{ + return sb->version == BCACHE_SB_VERSION_BDEV + || sb->version == BCACHE_SB_VERSION_BDEV_WITH_OFFSET; +} + +BITMASK(CACHE_SYNC, struct cache_sb, flags, 0, 1); +BITMASK(CACHE_DISCARD, struct cache_sb, flags, 1, 1); +BITMASK(CACHE_REPLACEMENT, struct cache_sb, flags, 2, 3); +#define CACHE_REPLACEMENT_LRU 0U +#define CACHE_REPLACEMENT_FIFO 1U +#define CACHE_REPLACEMENT_RANDOM 2U + +BITMASK(BDEV_CACHE_MODE, struct cache_sb, flags, 0, 4); +#define CACHE_MODE_WRITETHROUGH 0U +#define CACHE_MODE_WRITEBACK 1U +#define CACHE_MODE_WRITEAROUND 2U +#define CACHE_MODE_NONE 3U +BITMASK(BDEV_STATE, struct cache_sb, flags, 61, 2); +#define BDEV_STATE_NONE 0U +#define BDEV_STATE_CLEAN 1U +#define BDEV_STATE_DIRTY 2U +#define BDEV_STATE_STALE 3U + +inline uint64_t crc64(const void *_data, size_t len); + +#define node(i, j) ((void *) ((i)->d + (j))) +#define end(i) node(i, (i)->keys) + +#define 
csum_set(i) \ + crc64(((void *) (i)) + 8, ((void *) end(i)) - (((void *) (i)) + 8)) + +#endif diff --git a/initramfs/hook b/initramfs/hook new file mode 100755 index 00000000..ce328f3a --- /dev/null +++ b/initramfs/hook @@ -0,0 +1,22 @@ +#!/bin/sh + +PREREQ="udev" + +prereqs() +{ + echo "$PREREQ" +} + +case $1 in +prereqs) + prereqs + exit 0 + ;; +esac + +. /usr/share/initramfs-tools/hook-functions + +cp -pt "${DESTDIR}/lib/udev/rules.d" /lib/udev/rules.d/61-bcache.rules +copy_exec /lib/udev/bcache-register +copy_exec /sbin/probe-bcache +manual_add_modules bcache diff --git a/make-bcache.8 b/make-bcache.8 new file mode 100644 index 00000000..337a4148 --- /dev/null +++ b/make-bcache.8 @@ -0,0 +1,26 @@ +.TH make-bcache 8 +.SH NAME +make-bcache \- create a cache device +.SH SYNOPSIS +.B make-bcache +[\fB \-U\ \fIUUID\fR ] +[\fB \-b\ \fIbucket-size\fR ] +.I device +.SH OPTIONS +.TP +.BR \-C +Create a cache +.TP +.BR \-B +Create a backing device (kernel functionality not yet implemented) +.TP +.BR \-U\ \fIUUID +Create a cache device with the specified UUID +.TP +.BR \-b\ \fIbucket-size +Specifies the bucket size. Allocation is done in terms of buckets, and cache +hits are counted per bucket; thus a smaller bucket size will give better cache +utilization, but poorer write performance. The bucket size is intended to be +equal to the size of your SSD's erase blocks, which seems to be 128k-512k for +most SSDs. Must be a power of two; accepts human readable units. Defaults to +512k. 
diff --git a/make-bcache.c b/make-bcache.c
new file mode 100644
index 00000000..9c038a89
--- /dev/null
+++ b/make-bcache.c
@@ -0,0 +1,511 @@
+/*
+ * Author: Kent Overstreet
+ *
+ * GPLv2
+ */
+
+#define _FILE_OFFSET_BITS 64
+#define __USE_FILE_OFFSET64
+#define _XOPEN_SOURCE 600
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <linux/fs.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <uuid/uuid.h>
+
+#include "bcache.h"
+
+/*
+ * Type-checked maximum: each argument is evaluated exactly once, and the
+ * pointer comparison draws a compiler warning when x and y differ in type.
+ */
+#define max(x, y) ({ \
+	typeof(x) _max1 = (x); \
+	typeof(y) _max2 = (y); \
+	(void) (&_max1 == &_max2); \
+	_max1 > _max2 ? _max1 : _max2; })
+
+/*
+ * Size of the file or block device behind fd, in 512-byte sectors.  Exits
+ * the program if the size cannot be determined.
+ */
+uint64_t getblocks(int fd)
+{
+	uint64_t ret;
+	struct stat statbuf;
+
+	if (fstat(fd, &statbuf)) {
+		perror("stat error");
+		exit(EXIT_FAILURE);
+	}
+
+	ret = statbuf.st_size / 512;
+
+	if (S_ISBLK(statbuf.st_mode)) {
+		/*
+		 * BLKGETSIZE stores an unsigned long, which does not fill a
+		 * uint64_t on 32-bit hosts; BLKGETSIZE64 stores the size in
+		 * bytes as a full 64-bit value.
+		 */
+		uint64_t bytes;
+
+		if (ioctl(fd, BLKGETSIZE64, &bytes)) {
+			perror("ioctl error");
+			exit(EXIT_FAILURE);
+		}
+		ret = bytes / 512;
+	}
+
+	return ret;
+}
+
+/*
+ * Parses a decimal number with an optional k/m/g/t suffix (either case);
+ * k scales by 1024 and each later suffix by a further factor of 1024.
+ * Only the first character after the digits is examined.
+ */
+uint64_t hatoi(const char *s)
+{
+	char *e;
+	long long i = strtoll(s, &e, 10);
+
+	switch (*e) {
+	case 't':
+	case 'T':
+		i *= 1024;
+		/* fallthrough */
+	case 'g':
+	case 'G':
+		i *= 1024;
+		/* fallthrough */
+	case 'm':
+	case 'M':
+		i *= 1024;
+		/* fallthrough */
+	case 'k':
+	case 'K':
+		i *= 1024;
+	}
+
+	return i;
+}
+
+/*
+ * Parses a human-readable size in bytes and converts it to 512-byte
+ * sectors.  Exits with a message prefixed by msg unless the size is a power
+ * of two and the sector count lies in 1..USHRT_MAX.
+ */
+unsigned hatoi_validate(const char *s, const char *msg)
+{
+	uint64_t v = hatoi(s);
+
+	if (v & (v - 1)) {
+		printf("%s must be a power of two\n", msg);
+		exit(EXIT_FAILURE);
+	}
+
+	v /= 512;
+
+	if (v > USHRT_MAX) {
+		printf("%s too large\n", msg);
+		exit(EXIT_FAILURE);
+	}
+
+	if (!v) {
+		printf("%s too small\n", msg);
+		exit(EXIT_FAILURE);
+	}
+
+	return v;
+}
+
+/* Returns a pointer to the first non-whitespace character of str. */
+char *skip_spaces(const char *str)
+{
+	/* <ctype.h> functions take an unsigned char value, not a plain char */
+	while (isspace((unsigned char) *str))
+		++str;
+	return (char *)str;
+}
+
+/*
+ * Strips leading and trailing whitespace.  The trailing run is cut off by
+ * writing a NUL into s; the return value points past the leading run.
+ */
+char *strim(char *s)
+{
+	size_t size;
+	char *end;
+
+	s = skip_spaces(s);
+	size = strlen(s);
+	if (!size)
+		return s;
+
+	end = s + size - 1;
+	while (end >= s && isspace((unsigned char) *end))
+		end--;
+	*(end + 1) = '\0';
+
+	return s;
+}
+
+/*
+ * Looks up buf, with surrounding whitespace ignored, in the NULL-terminated
+ * array list.  Returns the index of the matching entry, -EINVAL if there is
+ * none, or -ENOMEM if the working copy cannot be allocated.
+ */
+ssize_t read_string_list(const char *buf, const char * const list[])
+{
+	size_t i;
+	char *s, *d = strdup(buf);
+	if (!d)
+		return -ENOMEM;
+
+	s = strim(d);
+
+	for (i = 0; list[i]; i++)
+		if (!strcmp(list[i], s))
+			break;
+
+	free(d);
+
+	if (!list[i])
+		return -EINVAL;
+
+	return i;
+}
+
+/* Prints the option summary and exits with EXIT_FAILURE. */
+void usage()
+{
+	printf("Usage: make-bcache [options] device\n"
+	       " -C, --cache Format a cache device\n"
+	       " -B, --bdev Format a backing device\n"
+	       " -b, --bucket bucket size\n"
+	       " -w, --block block size (hard sector size of SSD, often 2k)\n"
+	       " -o, --data-offset data offset in sectors\n"
+	       " --cset-uuid UUID for the cache set\n"
+//	       " -U UUID\n"
+	       " --writeback enable writeback\n"
+	       " --discard enable discards\n"
+	       " --cache_replacement_policy=(lru|fifo|random)\n"
+	       " -h, --help display this help and exit\n");
+	exit(EXIT_FAILURE);
+}
+
+/* Names accepted by --cache_replacement_policy, in CACHE_REPLACEMENT_* order */
+const char * const cache_replacement_policies[] = {
+	"lru",
+	"fifo",
+	"random",
+	NULL
+};
+
+/*
+ * Builds a superblock for dev and writes it at sector SB_SECTOR.
+ *
+ * block_size and bucket_size are in 512-byte sectors.  With bdev set, a
+ * backing-device superblock is written: writeback selects the cache mode
+ * and a data_offset other than BDEV_DATA_START_DEFAULT switches the version
+ * to BCACHE_SB_VERSION_BDEV_WITH_OFFSET.  Otherwise a cache-device
+ * superblock is written, using discard and cache_replacement_policy.  The
+ * device gets a freshly generated UUID and set_uuid as its cache set UUID.
+ * The resulting fields are printed; a failed open or write exits.
+ */
+static void write_sb(char *dev, unsigned block_size, unsigned bucket_size,
+		     bool writeback, bool discard,
+		     unsigned cache_replacement_policy,
+		     uint64_t data_offset,
+		     uuid_t set_uuid, bool bdev)
+{
+	int fd;
+	char uuid_str[40], set_uuid_str[40];
+	struct cache_sb sb;
+
+	if ((fd = open(dev, O_RDWR|O_EXCL)) == -1) {
+		printf("Can't open dev %s: %s\n", dev, strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	memset(&sb, 0, sizeof(struct cache_sb));
+
+	sb.offset = SB_SECTOR;
+	sb.version = bdev
+		? BCACHE_SB_VERSION_BDEV
+		: BCACHE_SB_VERSION_CDEV;
+
+	memcpy(sb.magic, bcache_magic, 16);
+	uuid_generate(sb.uuid);
+	memcpy(sb.set_uuid, set_uuid, sizeof(sb.set_uuid));
+
+	sb.bucket_size = bucket_size;
+	sb.block_size = block_size;
+
+	uuid_unparse(sb.uuid, uuid_str);
+	uuid_unparse(sb.set_uuid, set_uuid_str);
+
+	if (SB_IS_BDEV(&sb)) {
+		SET_BDEV_CACHE_MODE(
+			&sb, writeback ? CACHE_MODE_WRITEBACK : CACHE_MODE_WRITETHROUGH);
+
+		if (data_offset != BDEV_DATA_START_DEFAULT) {
+			sb.version = BCACHE_SB_VERSION_BDEV_WITH_OFFSET;
+			sb.data_offset = data_offset;
+		}
+
+		printf("UUID: %s\n"
+		       "Set UUID: %s\n"
+		       "version: %u\n"
+		       "block_size: %u\n"
+		       "data_offset: %ju\n",
+		       uuid_str, set_uuid_str,
+		       (unsigned) sb.version,
+		       sb.block_size,
+		       data_offset);
+	} else {
+		sb.nbuckets = getblocks(fd) / sb.bucket_size;
+		sb.nr_in_set = 1;
+		sb.first_bucket = (23 / sb.bucket_size) + 1;
+
+		if (sb.nbuckets < 1 << 7) {
+			printf("Not enough buckets: %ju, need %u\n",
+			       sb.nbuckets, 1 << 7);
+			exit(EXIT_FAILURE);
+		}
+
+		SET_CACHE_DISCARD(&sb, discard);
+		SET_CACHE_REPLACEMENT(&sb, cache_replacement_policy);
+
+		printf("UUID: %s\n"
+		       "Set UUID: %s\n"
+		       "version: %u\n"
+		       "nbuckets: %ju\n"
+		       "block_size: %u\n"
+		       "bucket_size: %u\n"
+		       "nr_in_set: %u\n"
+		       "nr_this_dev: %u\n"
+		       "first_bucket: %u\n",
+		       uuid_str, set_uuid_str,
+		       (unsigned) sb.version,
+		       sb.nbuckets,
+		       sb.block_size,
+		       sb.bucket_size,
+		       sb.nr_in_set,
+		       sb.nr_this_dev,
+		       sb.first_bucket);
+	}
+
+	sb.csum = csum_set(&sb);
+
+	if (pwrite(fd, &sb, sizeof(sb), SB_SECTOR << 9) != sizeof(sb)) {
+		perror("write error");
+		exit(EXIT_FAILURE);
+	}
+
+	fsync(fd);
+	close(fd);
+}
+
+/*
+ * Returns the block size to use for path, in 512-byte sectors: the logical
+ * block size for a block device, st_blksize for anything else.  Exits the
+ * program if path cannot be statted, opened or queried.
+ */
+static unsigned get_blocksize(const char *path)
+{
+	struct stat statbuf;
+
+	if (stat(path, &statbuf)) {
+		fprintf(stderr, "Error statting %s: %s\n",
+			path, strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (S_ISBLK(statbuf.st_mode)) {
+		/* check IO limits:
+		 * BLKALIGNOFF: alignment_offset
+		 * BLKPBSZGET: physical_block_size
+		 * BLKSSZGET: logical_block_size
+		 * BLKIOMIN: minimum_io_size
+		 * BLKIOOPT: optimal_io_size
+		 *
+		 * It may be tempting to use physical_block_size,
+		 * or even minimum_io_size.
+		 * But to be as transparent as possible,
+		 * we want to use logical_block_size.
+		 */
+		unsigned int logical_block_size;
+		int fd = open(path, O_RDONLY);
+
+		if (fd < 0) {
+			fprintf(stderr, "open(%s) failed: %m\n", path);
+			exit(EXIT_FAILURE);
+		}
+		if (ioctl(fd, BLKSSZGET, &logical_block_size)) {
+			fprintf(stderr, "ioctl(%s, BLKSSZGET) failed: %m\n", path);
+			exit(EXIT_FAILURE);
+		}
+		close(fd);
+		return logical_block_size / 512;
+
+	}
+	/* else: not a block device.
+	 * Why would we even want to write a bcache super block there? */
+
+	return statbuf.st_blksize / 512;
+}
+
+/*
+ * make-bcache (-C|-B) [options] device...
+ *
+ * Formats every device named after -C as a cache device and every device
+ * named after -B as a backing device.  All of them share one cache set UUID,
+ * either generated here or supplied with --cset-uuid.
+ */
+int main(int argc, char **argv)
+{
+	int c, bdev = -1;
+	unsigned i, ncache_devices = 0, nbacking_devices = 0;
+	char *cache_devices[argc];
+	char *backing_devices[argc];
+
+	unsigned block_size = 0, bucket_size = 1024;
+	int writeback = 0, discard = 0;
+	unsigned cache_replacement_policy = 0;
+	ssize_t policy;
+	uint64_t data_offset = BDEV_DATA_START_DEFAULT;
+	uuid_t set_uuid;
+
+	uuid_generate(set_uuid);
+
+	struct option opts[] = {
+		{ "cache", 0, NULL, 'C' },
+		{ "bdev", 0, NULL, 'B' },
+		{ "bucket", 1, NULL, 'b' },
+		{ "block", 1, NULL, 'w' },
+		{ "writeback", 0, &writeback, 1 },
+		{ "discard", 0, &discard, 1 },
+		{ "cache_replacement_policy", 1, NULL, 'p' },
+		/* usage() advertises --data-offset; the old spelling still works */
+		{ "data-offset", 1, NULL, 'o' },
+		{ "data_offset", 1, NULL, 'o' },
+		{ "cset-uuid", 1, NULL, 'u' },
+		{ "help", 0, NULL, 'h' },
+		{ NULL, 0, NULL, 0 },
+	};
+
+	/*
+	 * The leading '-' makes getopt_long hand back each non-option
+	 * argument as option code 1, so a device is filed under whichever
+	 * of -C/-B was seen last.
+	 */
+	while ((c = getopt_long(argc, argv,
+				"-hCBU:w:b:o:",
+				opts, NULL)) != -1)
+		switch (c) {
+		case 'C':
+			bdev = 0;
+			break;
+		case 'B':
+			bdev = 1;
+			break;
+		case 'b':
+			bucket_size = hatoi_validate(optarg, "bucket size");
+			break;
+		case 'w':
+			block_size = hatoi_validate(optarg, "block size");
+			break;
+#if 0
+		case 'U':
+			if (uuid_parse(optarg, sb.uuid)) {
+				printf("Bad uuid\n");
+				exit(EXIT_FAILURE);
+			}
+			break;
+#endif
+		case 'p':
+			policy = read_string_list(optarg,
+						  cache_replacement_policies);
+			/* a negative errno must not be stored as a policy index */
+			if (policy < 0) {
+				printf("Bad cache replacement policy\n");
+				exit(EXIT_FAILURE);
+			}
+			cache_replacement_policy = policy;
+			break;
+		case 'o':
+			data_offset = atoll(optarg);
+			if (data_offset < BDEV_DATA_START_DEFAULT) {
+				printf("Bad data offset; minimum %d sectors\n",
+				       BDEV_DATA_START_DEFAULT);
+				exit(EXIT_FAILURE);
+			}
+			break;
+		case 'u':
+			if (uuid_parse(optarg, set_uuid)) {
+				printf("Bad uuid\n");
+				exit(EXIT_FAILURE);
+			}
+			break;
+		case 'h':
+			usage();
+			break;
+		case 1:
+			if (bdev == -1) {
+				printf("Please specify -C or -B\n");
+				exit(EXIT_FAILURE);
+			}
+
+			if (bdev)
+				backing_devices[nbacking_devices++] = optarg;
+			else
+				cache_devices[ncache_devices++] = optarg;
+			break;
+		}
+
+	if (!ncache_devices && !nbacking_devices) {
+		printf("Please supply a device\n");
+		usage();
+	}
+
+	if (!block_size) {
+		for (i = 0; i < ncache_devices; i++)
+			block_size = max(block_size,
+					 get_blocksize(cache_devices[i]));
+
+		for (i = 0; i < nbacking_devices; i++)
+			block_size = max(block_size,
+					 get_blocksize(backing_devices[i]));
+	}
+
+	/* checked only now, so that an auto-detected block size is covered too */
+	if (bucket_size < block_size) {
+		printf("Bucket size cannot be smaller than block size\n");
+		exit(EXIT_FAILURE);
+	}
+
+	for (i = 0; i < ncache_devices; i++)
+		write_sb(cache_devices[i], block_size, bucket_size,
+			 writeback, discard, cache_replacement_policy,
+			 data_offset, set_uuid, false);
+
+	for (i = 0; i < nbacking_devices; i++)
+		write_sb(backing_devices[i], block_size, bucket_size,
+			 writeback, discard, cache_replacement_policy,
+			 data_offset, set_uuid, true);
+
+	return 0;
+}
diff --git a/probe-bcache.8 b/probe-bcache.8
new file mode 100644
index 00000000..17273f02
--- /dev/null
+++ b/probe-bcache.8
@@ -0,0 +1,17 @@
+.TH probe-bcache 8
+.SH NAME
+probe-bcache \- probe a bcache device
+.SH SYNOPSIS
+.B probe-bcache
+[\fB \-o\ \fIudev\fR ]
+.I device
+.SH OPTIONS
+.TP
+.BR \-o
+return UUID in udev style for invocation by udev rule as IMPORT{program}
+.SH USAGE
+Return UUID if device identified as bcache-formatted.
+
+Only necessary until support for the bcache superblock is included
+in blkid; in the meantime, provides just enough functionality for a udev script
+to create the /dev/disk/by-uuid symlink.
diff --git a/probe-bcache.c b/probe-bcache.c
new file mode 100644
index 00000000..caff7b65
--- /dev/null
+++ b/probe-bcache.c
@@ -0,0 +1,89 @@
+/*
+ * Author: Kent Overstreet
+ *
+ * GPLv2
+ */
+
+#define _FILE_OFFSET_BITS 64
+#define __USE_FILE_OFFSET64
+#define _XOPEN_SOURCE 500
+
+#include <fcntl.h>
+#include <linux/fs.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <uuid/uuid.h>
+
+#include "bcache.h"
+
+/*
+ * probe-bcache [-o udev] device...
+ *
+ * Reads a superblock from byte offset SB_START of every device named on the
+ * command line and, for each one whose magic matches bcache_magic, prints
+ * its UUID: as a 'device: UUID="..." TYPE="bcache"' line by default, or as
+ * udev KEY=value pairs with "-o udev".  Devices that cannot be opened or
+ * read in full, or that carry another magic, are skipped silently.  An
+ * output format other than "udev" exits with EXIT_FAILURE.
+ */
+int main(int argc, char **argv)
+{
+	bool udev = false;
+	int i, o;
+	extern char *optarg;
+	struct cache_sb sb;
+	char uuid[40];
+
+	while ((o = getopt(argc, argv, "o:")) != -1)
+		switch (o) {
+		case 'o':
+			if (strcmp("udev", optarg)) {
+				printf("Invalid output format %s\n", optarg);
+				exit(EXIT_FAILURE);
+			}
+			udev = true;
+			break;
+		}
+
+	argv += optind;
+	argc -= optind;
+
+	for (i = 0; i < argc; i++) {
+		ssize_t r;
+		int fd = open(argv[i], O_RDONLY);
+
+		if (fd == -1)
+			continue;
+
+		/* one read is all we need: close before any path can skip it */
+		r = pread(fd, &sb, sizeof(sb), SB_START);
+		close(fd);
+
+		if (r != (ssize_t) sizeof(sb))
+			continue;
+
+		if (memcmp(sb.magic, bcache_magic, sizeof(sb.magic)))
+			continue;
+
+		uuid_unparse(sb.uuid, uuid);
+
+		if (udev)
+			printf("ID_FS_UUID=%s\n"
+			       "ID_FS_UUID_ENC=%s\n"
+			       "ID_FS_TYPE=bcache\n",
+			       uuid, uuid);
+		else
+			/* device path first, then the UUID inside the quotes */
+			printf("%s: UUID=\"%s\" TYPE=\"bcache\"\n",
+			       argv[i], uuid);
+	}
+
+	return 0;
+}