snapraid

2025-12-08 00:00:12 +03:00 · 2018-11-23 00:44:20 -05:00 · 2018-11-23 00:44:20 -05:00 · c416528eaa
commit c416528eaa
parent cc6479303f
23 changed files with 21844 additions and 1 deletions
--- a/2
+++ b/2
@ -4,7 +4,7 @@ INSTALL=install
 CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall				\
 	-Wno-pointer-sign					\
 	-fno-strict-aliasing					\
-	-I. -Iinclude						\
+	-I. -Iinclude -Iraid					\
 	-D_FILE_OFFSET_BITS=64					\
 	-D_GNU_SOURCE						\
 	-D_LGPL_SOURCE						\
--- a/bcachefs.c
+++ b/bcachefs.c
@ -21,6 +21,8 @@
 #include <sys/types.h>
 #include <sys/stat.h>

+#include <raid/raid.h>
+
 #include "cmds.h"

 static void usage(void)
@ -141,6 +143,8 @@ static int data_cmds(int argc, char *argv[])

 int main(int argc, char *argv[])
 {
+	raid_init();
+
 	full_cmd = argv[0];

 	setvbuf(stdout, NULL, _IOLBF, 0);
--- a/raid/COPYING
+++ b/raid/COPYING
@ -0,0 +1,339 @@
+		    GNU GENERAL PUBLIC LICENSE
+		       Version 2, June 1991
+
+ Copyright (C) 1989, 1991 Free Software Foundation, Inc.
+                          675 Mass Ave, Cambridge, MA 02139, USA
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+			    Preamble
+
+  The licenses for most software are designed to take away your
+freedom to share and change it.  By contrast, the GNU General Public
+License is intended to guarantee your freedom to share and change free
+software--to make sure the software is free for all its users.  This
+General Public License applies to most of the Free Software
+Foundation's software and to any other program whose authors commit to
+using it.  (Some other Free Software Foundation software is covered by
+the GNU Library General Public License instead.)  You can apply it to
+your programs, too.
+
+  When we speak of free software, we are referring to freedom, not
+price.  Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+this service if you wish), that you receive source code or can get it
+if you want it, that you can change the software or use pieces of it
+in new free programs; and that you know you can do these things.
+
+  To protect your rights, we need to make restrictions that forbid
+anyone to deny you these rights or to ask you to surrender the rights.
+These restrictions translate to certain responsibilities for you if you
+distribute copies of the software, or if you modify it.
+
+  For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must give the recipients all the rights that
+you have.  You must make sure that they, too, receive or can get the
+source code.  And you must show them these terms so they know their
+rights.
+
+  We protect your rights with two steps: (1) copyright the software, and
+(2) offer you this license which gives you legal permission to copy,
+distribute and/or modify the software.
+
+  Also, for each author's protection and ours, we want to make certain
+that everyone understands that there is no warranty for this free
+software.  If the software is modified by someone else and passed on, we
+want its recipients to know that what they have is not the original, so
+that any problems introduced by others will not reflect on the original
+authors' reputations.
+
+  Finally, any free program is threatened constantly by software
+patents.  We wish to avoid the danger that redistributors of a free
+program will individually obtain patent licenses, in effect making the
+program proprietary.  To prevent this, we have made it clear that any
+patent must be licensed for everyone's free use or not licensed at all.
+
+  The precise terms and conditions for copying, distribution and
+modification follow.
+
+		    GNU GENERAL PUBLIC LICENSE
+   TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
+
+  0. This License applies to any program or other work which contains
+a notice placed by the copyright holder saying it may be distributed
+under the terms of this General Public License.  The "Program", below,
+refers to any such program or work, and a "work based on the Program"
+means either the Program or any derivative work under copyright law:
+that is to say, a work containing the Program or a portion of it,
+either verbatim or with modifications and/or translated into another
+language.  (Hereinafter, translation is included without limitation in
+the term "modification".)  Each licensee is addressed as "you".
+
+Activities other than copying, distribution and modification are not
+covered by this License; they are outside its scope.  The act of
+running the Program is not restricted, and the output from the Program
+is covered only if its contents constitute a work based on the
+Program (independent of having been made by running the Program).
+Whether that is true depends on what the Program does.
+
+  1. You may copy and distribute verbatim copies of the Program's
+source code as you receive it, in any medium, provided that you
+conspicuously and appropriately publish on each copy an appropriate
+copyright notice and disclaimer of warranty; keep intact all the
+notices that refer to this License and to the absence of any warranty;
+and give any other recipients of the Program a copy of this License
+along with the Program.
+
+You may charge a fee for the physical act of transferring a copy, and
+you may at your option offer warranty protection in exchange for a fee.
+
+  2. You may modify your copy or copies of the Program or any portion
+of it, thus forming a work based on the Program, and copy and
+distribute such modifications or work under the terms of Section 1
+above, provided that you also meet all of these conditions:
+
+    a) You must cause the modified files to carry prominent notices
+    stating that you changed the files and the date of any change.
+
+    b) You must cause any work that you distribute or publish, that in
+    whole or in part contains or is derived from the Program or any
+    part thereof, to be licensed as a whole at no charge to all third
+    parties under the terms of this License.
+
+    c) If the modified program normally reads commands interactively
+    when run, you must cause it, when started running for such
+    interactive use in the most ordinary way, to print or display an
+    announcement including an appropriate copyright notice and a
+    notice that there is no warranty (or else, saying that you provide
+    a warranty) and that users may redistribute the program under
+    these conditions, and telling the user how to view a copy of this
+    License.  (Exception: if the Program itself is interactive but
+    does not normally print such an announcement, your work based on
+    the Program is not required to print an announcement.)
+
+These requirements apply to the modified work as a whole.  If
+identifiable sections of that work are not derived from the Program,
+and can be reasonably considered independent and separate works in
+themselves, then this License, and its terms, do not apply to those
+sections when you distribute them as separate works.  But when you
+distribute the same sections as part of a whole which is a work based
+on the Program, the distribution of the whole must be on the terms of
+this License, whose permissions for other licensees extend to the
+entire whole, and thus to each and every part regardless of who wrote it.
+
+Thus, it is not the intent of this section to claim rights or contest
+your rights to work written entirely by you; rather, the intent is to
+exercise the right to control the distribution of derivative or
+collective works based on the Program.
+
+In addition, mere aggregation of another work not based on the Program
+with the Program (or with a work based on the Program) on a volume of
+a storage or distribution medium does not bring the other work under
+the scope of this License.
+
+  3. You may copy and distribute the Program (or a work based on it,
+under Section 2) in object code or executable form under the terms of
+Sections 1 and 2 above provided that you also do one of the following:
+
+    a) Accompany it with the complete corresponding machine-readable
+    source code, which must be distributed under the terms of Sections
+    1 and 2 above on a medium customarily used for software interchange; or,
+
+    b) Accompany it with a written offer, valid for at least three
+    years, to give any third party, for a charge no more than your
+    cost of physically performing source distribution, a complete
+    machine-readable copy of the corresponding source code, to be
+    distributed under the terms of Sections 1 and 2 above on a medium
+    customarily used for software interchange; or,
+
+    c) Accompany it with the information you received as to the offer
+    to distribute corresponding source code.  (This alternative is
+    allowed only for noncommercial distribution and only if you
+    received the program in object code or executable form with such
+    an offer, in accord with Subsection b above.)
+
+The source code for a work means the preferred form of the work for
+making modifications to it.  For an executable work, complete source
+code means all the source code for all modules it contains, plus any
+associated interface definition files, plus the scripts used to
+control compilation and installation of the executable.  However, as a
+special exception, the source code distributed need not include
+anything that is normally distributed (in either source or binary
+form) with the major components (compiler, kernel, and so on) of the
+operating system on which the executable runs, unless that component
+itself accompanies the executable.
+
+If distribution of executable or object code is made by offering
+access to copy from a designated place, then offering equivalent
+access to copy the source code from the same place counts as
+distribution of the source code, even though third parties are not
+compelled to copy the source along with the object code.
+
+  4. You may not copy, modify, sublicense, or distribute the Program
+except as expressly provided under this License.  Any attempt
+otherwise to copy, modify, sublicense or distribute the Program is
+void, and will automatically terminate your rights under this License.
+However, parties who have received copies, or rights, from you under
+this License will not have their licenses terminated so long as such
+parties remain in full compliance.
+
+  5. You are not required to accept this License, since you have not
+signed it.  However, nothing else grants you permission to modify or
+distribute the Program or its derivative works.  These actions are
+prohibited by law if you do not accept this License.  Therefore, by
+modifying or distributing the Program (or any work based on the
+Program), you indicate your acceptance of this License to do so, and
+all its terms and conditions for copying, distributing or modifying
+the Program or works based on it.
+
+  6. Each time you redistribute the Program (or any work based on the
+Program), the recipient automatically receives a license from the
+original licensor to copy, distribute or modify the Program subject to
+these terms and conditions.  You may not impose any further
+restrictions on the recipients' exercise of the rights granted herein.
+You are not responsible for enforcing compliance by third parties to
+this License.
+
+  7. If, as a consequence of a court judgment or allegation of patent
+infringement or for any other reason (not limited to patent issues),
+conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License.  If you cannot
+distribute so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you
+may not distribute the Program at all.  For example, if a patent
+license would not permit royalty-free redistribution of the Program by
+all those who receive copies directly or indirectly through you, then
+the only way you could satisfy both it and this License would be to
+refrain entirely from distribution of the Program.
+
+If any portion of this section is held invalid or unenforceable under
+any particular circumstance, the balance of the section is intended to
+apply and the section as a whole is intended to apply in other
+circumstances.
+
+It is not the purpose of this section to induce you to infringe any
+patents or other property right claims or to contest validity of any
+such claims; this section has the sole purpose of protecting the
+integrity of the free software distribution system, which is
+implemented by public license practices.  Many people have made
+generous contributions to the wide range of software distributed
+through that system in reliance on consistent application of that
+system; it is up to the author/donor to decide if he or she is willing
+to distribute software through any other system and a licensee cannot
+impose that choice.
+
+This section is intended to make thoroughly clear what is believed to
+be a consequence of the rest of this License.
+
+  8. If the distribution and/or use of the Program is restricted in
+certain countries either by patents or by copyrighted interfaces, the
+original copyright holder who places the Program under this License
+may add an explicit geographical distribution limitation excluding
+those countries, so that distribution is permitted only in or among
+countries not thus excluded.  In such case, this License incorporates
+the limitation as if written in the body of this License.
+
+  9. The Free Software Foundation may publish revised and/or new versions
+of the General Public License from time to time.  Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+Each version is given a distinguishing version number.  If the Program
+specifies a version number of this License which applies to it and "any
+later version", you have the option of following the terms and conditions
+either of that version or of any later version published by the Free
+Software Foundation.  If the Program does not specify a version number of
+this License, you may choose any version ever published by the Free Software
+Foundation.
+
+  10. If you wish to incorporate parts of the Program into other free
+programs whose distribution conditions are different, write to the author
+to ask for permission.  For software which is copyrighted by the Free
+Software Foundation, write to the Free Software Foundation; we sometimes
+make exceptions for this.  Our decision will be guided by the two goals
+of preserving the free status of all derivatives of our free software and
+of promoting the sharing and reuse of software generally.
+
+			    NO WARRANTY
+
+  11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
+FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW.  EXCEPT WHEN
+OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
+PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
+OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.  THE ENTIRE RISK AS
+TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU.  SHOULD THE
+PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
+REPAIR OR CORRECTION.
+
+  12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
+REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
+INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
+TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
+YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
+PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGES.
+
+		     END OF TERMS AND CONDITIONS
+
+	Appendix: How to Apply These Terms to Your New Programs
+
+  If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+  To do so, attach the following notices to the program.  It is safest
+to attach them to the start of each source file to most effectively
+convey the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+    <one line to give the program's name and a brief idea of what it does.>
+    Copyright (C) 19yy  <name of author>
+
+    This program is free software; you can redistribute it and/or modify
+    it under the terms of the GNU General Public License as published by
+    the Free Software Foundation; either version 2 of the License, or
+    (at your option) any later version.
+
+    This program is distributed in the hope that it will be useful,
+    but WITHOUT ANY WARRANTY; without even the implied warranty of
+    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+    GNU General Public License for more details.
+
+    You should have received a copy of the GNU General Public License
+    along with this program; if not, write to the Free Software
+    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+
+Also add information on how to contact you by electronic and paper mail.
+
+If the program is interactive, make it output a short notice like this
+when it starts in an interactive mode:
+
+    Gnomovision version 69, Copyright (C) 19yy name of author
+    Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+    This is free software, and you are welcome to redistribute it
+    under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License.  Of course, the commands you use may
+be called something other than `show w' and `show c'; they could even be
+mouse-clicks or menu items--whatever suits your program.
+
+You should also get your employer (if you work as a programmer) or your
+school, if any, to sign a "copyright disclaimer" for the program, if
+necessary.  Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Library General
+Public License instead of this License.
--- a/raid/check.c
+++ b/raid/check.c
@ -0,0 +1,185 @@
+/*
+ * Copyright (C) 2015 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "combo.h"
+#include "gf.h"
+
+/**
+ * Validate the provided failed blocks.
+ *
+ * This function checks if the specified failed blocks satisfy the redundancy
+ * information using the data from the known valid parity blocks.
+ *
+ * It's similar to raid_check(), just with a different format for arguments.
+ *
+ * The number of failed blocks @nr must be strictly less than the number of
+ * parities @nv, because you need one more parity to validate the recovery.
+ *
+ * No data or parity blocks are modified.
+ *
+ * @nr Number of failed data blocks.
+ * @id[] Vector of @nr indexes of the failed data blocks.
+ *   The indexes start from 0. They must be in order.
+ * @nv Number of valid parity blocks.
+ * @ip[] Vector of @nv indexes of the valid parity blocks.
+ *   The indexes start from 0. They must be in order.
+ * @nd Number of data blocks.
+ * @size Size of the blocks pointed by @vv. It must be a multiple of 64.
+ *   This function walks the blocks one byte at a time and does not check
+ *   the multiple itself; raid_check() enforces it before calling here.
+ * @vv Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @ip[@nv - 1] + 1) elements. The starting elements are the
+ *   blocks for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ * @return 0 if the check is satisfied. -1 otherwise.
+ */
+static int raid_validate(int nr, int *id, int nv, int *ip, int nd, size_t size, void **vv)
+{
+	/* the blocks are processed as plain byte arrays */
+	uint8_t **v = (uint8_t **)vv;
+	/* T[j][k] is the multiplication table of the inverted coefficient V[j * nr + k] */
+	const uint8_t *T[RAID_PARITY_MAX][RAID_PARITY_MAX];
+	/* G and V are @nr x @nr matrices stored row by row */
+	uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
+	uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
+	size_t i;
+	int j, k, l;
+
+	/* at least one parity beyond the first @nr must remain for the final check */
+	BUG_ON(nr >= nv);
+
+	/* setup the coefficients matrix */
+	/* only the first @nr valid parities are paired with the @nr failed disks */
+	for (j = 0; j < nr; ++j)
+		for (k = 0; k < nr; ++k)
+			G[j * nr + k] = A(ip[j], id[k]);
+
+	/* invert it to solve the system of linear equations */
+	raid_invert(G, V, nr);
+
+	/* get multiplication tables */
+	for (j = 0; j < nr; ++j)
+		for (k = 0; k < nr; ++k)
+			T[j][k] = table(V[j * nr + k]);
+
+	/* check all positions */
+	for (i = 0; i < size; ++i) {
+		/* working copy of the parity bytes at position i, one per valid parity */
+		uint8_t p[RAID_PARITY_MAX];
+
+		/* get parity */
+		for (j = 0; j < nv; ++j)
+			p[j] = v[nd + ip[j]][i];
+
+		/* compute delta parity, skipping broken disks */
+		/* k walks id[] in parallel with j, relying on id[] being in order */
+		for (j = 0, k = 0; j < nd; ++j) {
+			uint8_t b;
+
+			/* skip broken disks */
+			if (k < nr && id[k] == j) {
+				++k;
+				continue;
+			}
+
+			/* remove the contribution of this good disk from every parity */
+			b = v[j][i];
+			for (l = 0; l < nv; ++l)
+				p[l] ^= gfmul[b][gfgen[ip[l]][j]];
+		}
+
+		/* reconstruct data */
+		for (j = 0; j < nr; ++j) {
+			uint8_t b = 0;
+			int idj = id[j];
+
+			/* recompute the data */
+			/* only p[0] .. p[nr - 1] are read here; they are not modified below */
+			for (k = 0; k < nr; ++k)
+				b ^= T[j][k][p[k]];
+
+			/* add the parity contribution of the reconstructed data */
+			/* only the spare parities p[nr] .. p[nv - 1] are updated */
+			for (l = nr; l < nv; ++l)
+				p[l] ^= gfmul[b][gfgen[ip[l]][idj]];
+		}
+
+		/* check that the final parity is 0 */
+		/* a non-zero spare parity fails the whole check at the first such byte */
+		for (l = nr; l < nv; ++l)
+			if (p[l] != 0)
+				return -1;
+	}
+
+	return 0;
+}
+
+/**
+ * Check a set of presumed failed blocks against the remaining parities.
+ *
+ * The failure list @ir is split into failed data blocks (indexes lower
+ * than @nd) and failed parities (indexes from @nd on). The parities that are
+ * not listed as failed are collected and everything is passed to
+ * raid_validate().
+ *
+ * @nr Number of failed blocks. It must be strictly less than @np.
+ * @ir[] Vector of @nr indexes of the failed blocks, in increasing order.
+ *   Parity blocks are addressed as @nd + parity index.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks. It cannot exceed RAID_PARITY_MAX.
+ * @size Size of the blocks. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ * @return The value returned by raid_validate().
+ */
+int raid_check(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+	int good[RAID_PARITY_MAX]; /* indexes of the parities not marked as failed */
+	int ngood = 0; /* number of entries stored in good[] */
+	int ndata = 0; /* number of entries of ir[] referring to data blocks */
+	int next; /* next entry of ir[] to compare with a parity */
+	int par;
+
+	/* enforce limit on size */
+	BUG_ON(size % 64 != 0);
+
+	/* enforce limit on number of failures */
+	BUG_ON(nr >= np); /* >= because we check with extra parity */
+	BUG_ON(np > RAID_PARITY_MAX);
+
+	/* enforce order in index vector */
+	BUG_ON(nr >= 2 && ir[0] >= ir[1]);
+	BUG_ON(nr >= 3 && ir[1] >= ir[2]);
+	BUG_ON(nr >= 4 && ir[2] >= ir[3]);
+	BUG_ON(nr >= 5 && ir[3] >= ir[4]);
+	BUG_ON(nr >= 6 && ir[4] >= ir[5]);
+
+	/* enforce limit on index vector */
+	BUG_ON(nr > 0 && ir[nr-1] >= nd + np);
+
+	/* the failed data blocks are at the start of the ordered vector */
+	for (; ndata < nr; ++ndata)
+		if (ir[ndata] >= nd)
+			break;
+
+	/* walk all the parities, keeping only the ones not listed in ir[] */
+	next = ndata;
+	for (par = 0; par < np; ++par) {
+		if (next >= nr || ir[next] != nd + par) {
+			/* this parity is usable */
+			good[ngood] = par;
+			++ngood;
+			continue;
+		}
+
+		/* this parity is failed, move to the next failure entry */
+		++next;
+	}
+
+	return raid_validate(ndata, ir, ngood, good, nd, size, v);
+}
+
+/**
+ * Search for the smallest set of failed blocks that passes raid_check().
+ *
+ * The case of no failure is tried first (only if @np is not 0), then all
+ * the combinations of 1, 2, ... up to @np - 1 failed blocks, taken from the
+ * @nd + @np blocks, are tried in the order generated by combination_next().
+ *
+ * @ir[] Vector used to store the combination under test. When a positive
+ *   value is returned, it contains the indexes of the failed blocks.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks.
+ * @v Vector of pointers to the blocks of data and parity.
+ * @return The number of failed blocks of the first combination accepted by
+ *   raid_check(), 0 if the check with no failure passes, or -1 if no
+ *   combination is accepted.
+ */
+int raid_scan(int *ir, int nd, int np, size_t size, void **v)
+{
+	int total = nd + np; /* blocks among which the failures are chosen */
+	int nfail;
+
+	/* first try assuming that nothing is broken */
+	if (np != 0 && raid_check(0, 0, nd, np, size, v) == 0)
+		return 0;
+
+	/* then grow the number of assumed failures */
+	for (nfail = 1; nfail < np; ++nfail) {
+		int more;
+
+		/* enumerate all the combinations of nfail blocks out of total */
+		combination_first(nfail, total, ir);
+		for (more = 1; more; more = combination_next(nfail, total, ir)) {
+			/* stop at the first combination that validates */
+			if (raid_check(nfail, ir, nd, np, size, v) == 0)
+				return nfail;
+		}
+	}
+
+	/* no solution found */
+	return -1;
+}
+
--- a/raid/combo.h
+++ b/raid/combo.h
@ -0,0 +1,155 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_COMBO_H
+#define __RAID_COMBO_H
+
+#include <assert.h>
+
+/**
+ * Set up the first permutation with repetition of r of n elements.
+ *
+ * It's meant to be used together with permutation_next() in the form:
+ *
+ * int i[R];
+ * permutation_first(R, N, i);
+ * do {
+ *    code using i[0], i[1], ..., i[R-1]
+ * } while (permutation_next(R, N, i));
+ *
+ * This is equivalent to the code:
+ *
+ * for(i[0]=0;i[0]<N;++i[0])
+ *     for(i[1]=0;i[1]<N;++i[1])
+ *        ...
+ *            for(i[R-2]=0;i[R-2]<N;++i[R-2])
+ *                for(i[R-1]=0;i[R-1]<N;++i[R-1])
+ *                    code using i[0], i[1], ..., i[R-1]
+ *
+ * @r Number of positions. It must be greater than 0 and not greater than @n.
+ * @n Number of elements.
+ * @c[] Vector of @r positions, all set to 0 on return.
+ */
+static __always_inline void permutation_first(int r, int n, int *c)
+{
+	int pos = 0;
+
+	(void)n; /* referenced only by the assert below */
+	assert(0 < r && r <= n);
+
+	/* every position starts from the first element */
+	while (pos < r)
+		c[pos++] = 0;
+}
+
+/**
+ * Advance to the next permutation with repetition of r of n elements.
+ *
+ * The vector is handled like a counter in base @n, with the last position
+ * being the one that changes faster.
+ *
+ * @return 0 when all the permutations were already generated, 1 otherwise.
+ */
+static __always_inline int permutation_next(int r, int n, int *c)
+{
+	int pos = r - 1; /* position being incremented */
+
+	/* increment, carrying to the previous position on overflow */
+	while (++c[pos] >= n) {
+		/* an overflow of the first position means we have finished */
+		if (pos == 0)
+			return 0;
+
+		--pos;
+	}
+
+	/* restart all the positions after the one that was incremented */
+	for (++pos; pos < r; ++pos)
+		c[pos] = 0;
+
+	return 1;
+}
+
+/**
+ * Set up the first combination without repetition of r of n elements.
+ *
+ * It's meant to be used together with combination_next() in the form:
+ *
+ * int i[R];
+ * combination_first(R, N, i);
+ * do {
+ *    code using i[0], i[1], ..., i[R-1]
+ * } while (combination_next(R, N, i));
+ *
+ * This is equivalent to the code:
+ *
+ * for(i[0]=0;i[0]<N-(R-1);++i[0])
+ *     for(i[1]=i[0]+1;i[1]<N-(R-2);++i[1])
+ *        ...
+ *            for(i[R-2]=i[R-3]+1;i[R-2]<N-1;++i[R-2])
+ *                for(i[R-1]=i[R-2]+1;i[R-1]<N;++i[R-1])
+ *                    code using i[0], i[1], ..., i[R-1]
+ *
+ * @r Number of positions. It must be greater than 0 and not greater than @n.
+ * @n Number of elements.
+ * @c[] Vector of @r positions, set to 0, 1, ..., @r - 1 on return.
+ */
+static __always_inline void combination_first(int r, int n, int *c)
+{
+	int pos = 0;
+
+	(void)n; /* referenced only by the assert below */
+	assert(0 < r && r <= n);
+
+	/* the first combination is made of the first r elements, in order */
+	while (pos < r) {
+		c[pos] = pos;
+		++pos;
+	}
+}
+
+/**
+ * Advance to the next combination without repetition of r of n elements.
+ *
+ * The positions are kept in strictly increasing order, so the limit of each
+ * position is one less than the limit of the following one.
+ *
+ * @return 0 when all the combinations were already generated, 1 otherwise.
+ */
+static __always_inline int combination_next(int r, int n, int *c)
+{
+	int pos = r - 1; /* position being incremented */
+	int limit = n; /* first value not allowed at this position */
+
+	/* increment, moving to the previous position on overflow */
+	while (++c[pos] >= limit) {
+		/* an overflow of the first position means we have finished */
+		if (pos == 0)
+			return 0;
+
+		--pos;
+		--limit;
+	}
+
+	/* each following position restarts just after the previous one */
+	for (++pos; pos < r; ++pos)
+		c[pos] = c[pos - 1] + 1;
+
+	return 1;
+}
+#endif
+
--- a/raid/cpu.h
+++ b/raid/cpu.h
@ -0,0 +1,331 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_CPU_H
+#define __RAID_CPU_H
+
+#ifdef CONFIG_X86
+
+/**
+ * Executes the CPUID instruction.
+ *
+ * @func_eax Value loaded in EAX before executing CPUID.
+ * @sub_ecx Value loaded in ECX before executing CPUID.
+ * @reg Array of four values filled with the content of EAX, EBX, ECX
+ *   and EDX, in this order, after the instruction.
+ */
+static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
+{
+	asm volatile (
+#if defined(__i386__) && defined(__PIC__)
+		/* allow compilation in PIC mode saving ebx: */
+		/* ebx is swapped with the register picked for reg[1] */
+		/* before and after CPUID, instead of being used as output */
+		"xchgl %%ebx, %1\n"
+		"cpuid\n"
+		"xchgl %%ebx, %1\n"
+		: "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
+		: "0" (func_eax), "2" (sub_ecx)
+#else
+		/* ebx can be used directly as output */
+		"cpuid\n"
+		: "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
+		: "0" (func_eax), "2" (sub_ecx)
+#endif
+	);
+}
+
+/**
+ * Executes the XGETBV instruction with ECX=0.
+ *
+ * @reg Array of four values, with the same layout used by raid_cpuid().
+ *   Only reg[0] (from EAX) and reg[3] (from EDX) are written,
+ *   reg[1] and reg[2] are left untouched.
+ */
+static inline void raid_xgetbv(uint32_t* reg)
+{
+	/* get the value of the Extended Control Register ecx=0 */
+	asm volatile (
+		/* uses a direct encoding of the XGETBV instruction as only recent */
+		/* assemblers support it. */
+		/* the next line is equivalent to: "xgetbv\n" */
+		".byte 0x0f, 0x01, 0xd0\n"
+		: "=a" (reg[0]), "=d" (reg[3])
+		: "c" (0)
+	);
+}
+
+/* Size of a vendor buffer for raid_cpu_info(): 12 chars plus the terminator */
+#define CPU_VENDOR_MAX 13
+
+/**
+ * Gets the vendor string, the family and the model of the CPU.
+ *
+ * @vendor Buffer of at least CPU_VENDOR_MAX chars, filled with the 12 chars
+ *   of the vendor string returned by CPUID function 0, zero terminated.
+ * @family Filled with the family number from CPUID function 1.
+ * @model Filled with the model number from CPUID function 1.
+ */
+static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
+{
+	uint32_t reg[4];
+	unsigned f, ef, m, em;
+
+	raid_cpuid(0, 0, reg);
+
+	/*
+	 * The vendor string comes in EBX, EDX, ECX, in this order.
+	 *
+	 * Copy it with memcpy() because 'vendor' is a plain char buffer
+	 * without any alignment guarantee, and then it cannot be safely
+	 * written as an array of uint32_t.
+	 */
+	memcpy(vendor, &reg[1], sizeof(reg[1]));
+	memcpy(vendor + 4, &reg[3], sizeof(reg[3]));
+	memcpy(vendor + 8, &reg[2], sizeof(reg[2]));
+	vendor[12] = 0;
+
+	raid_cpuid(1, 0, reg);
+
+	f = (reg[0] >> 8) & 0xF; /* family, bits 8-11 of EAX */
+	ef = (reg[0] >> 20) & 0xFF; /* extended family, bits 20-27 of EAX */
+	m = (reg[0] >> 4) & 0xF; /* model, bits 4-7 of EAX */
+	em = (reg[0] >> 16) & 0xF; /* extended model, bits 16-19 of EAX */
+
+	if (f < 15 && strcmp(vendor, "AuthenticAMD") == 0) {
+		/* for AMD the extended fields are ignored below family 15 */
+		*family = f;
+		*model = m;
+	} else {
+		*family = f + ef;
+		*model = m + (em << 4);
+	}
+}
+
+static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
+{
+	uint32_t leaf1[4];
+
+	raid_cpuid(1, 0, leaf1);
+
+	/* all the requested bits must be set, both in ECX and in EDX */
+	return (leaf1[2] & cpuid_1_ecx) == cpuid_1_ecx
+		&& (leaf1[3] & cpuid_1_edx) == cpuid_1_edx;
+}
+
+static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t xcr0)
+{
+	uint32_t out[4];
+
+	/*
+	 * The three checks are nested to keep them strictly in order:
+	 * XGETBV is executed only if all the CPUID function 1 bits
+	 * requested by the caller are present.
+	 */
+	raid_cpuid(1, 0, out);
+	if ((out[2] & cpuid_1_ecx) == cpuid_1_ecx) {
+		raid_xgetbv(out);
+		if ((out[0] & xcr0) == xcr0) {
+			raid_cpuid(7, 0, out);
+			return (out[1] & cpuid_7_ebx) == cpuid_7_ebx;
+		}
+	}
+
+	return 0;
+}
+
+static inline int raid_cpu_has_sse2(void)
+{
+	/*
+	 * Intel® 64 and IA-32 Architectures Software Developer's Manual
+	 * 325462-048US September 2013
+	 *
+	 * 11.6.2 Checking for SSE/SSE2 Support
+	 * After checking that the CPUID instruction is supported (bit 21 of
+	 * EFLAGS), the SSE and SSE2 extensions are reported by
+	 * CPUID.01H:EDX.SSE[bit 25] = 1 and CPUID.01H:EDX.SSE2[bit 26] = 1.
+	 */
+	const uint32_t edx_sse2 = 1 << 26;
+
+	/* nothing is required from ECX */
+	return raid_cpu_match_sse(0, edx_sse2);
+}
+
+static inline int raid_cpu_has_ssse3(void)
+{
+	/*
+	 * Intel® 64 and IA-32 Architectures Software Developer's Manual
+	 * 325462-048US September 2013
+	 *
+	 * 12.7.2 Checking for SSSE3 Support
+	 * Follow the steps of Section 11.6.2, "Checking for SSE/SSE2
+	 * Support", and then check that the processor supports SSSE3
+	 * (if CPUID.01H:ECX.SSSE3[bit 9] = 1).
+	 */
+	const uint32_t ecx_ssse3 = 1 << 9;
+	const uint32_t edx_sse2 = 1 << 26;
+
+	return raid_cpu_match_sse(ecx_ssse3, edx_sse2);
+}
+
+static inline int raid_cpu_has_crc32(void)
+{
+	/*
+	 * Intel® 64 and IA-32 Architectures Software Developer's Manual
+	 * 325462-048US September 2013
+	 *
+	 * 12.12.3 Checking for SSE4.2 Support
+	 * Before using the CRC32 instruction, an application must check
+	 * that the processor supports SSE4.2
+	 * (if CPUID.01H:ECX.SSE4_2[bit 20] = 1).
+	 */
+	const uint32_t ecx_sse4_2 = 1 << 20;
+
+	/* nothing is required from EDX */
+	return raid_cpu_match_sse(ecx_sse4_2, 0);
+}
+
+static inline int raid_cpu_has_avx2(void)
+{
+	/*
+	 * Intel Architecture Instruction Set Extensions Programming Reference
+	 * 319433-022 October 2014
+	 *
+	 * 14.3 Detection of AVX instructions
+	 * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for
+	 * application use).
+	 * 2) Issue XGETBV and verify that XCR0[2:1] = `11b' (XMM state and
+	 * YMM state are enabled by OS).
+	 * 3) Detect CPUID.1:ECX.AVX[bit 28] = 1 (AVX instructions supported).
+	 * (Step 3 can be done in any order relative to 1 and 2)
+	 *
+	 * 14.7.1 Detection of AVX2
+	 * After identifying that the hardware supports AVX, AVX2 is
+	 * indicated by CPUID.(EAX=07H, ECX=0H):EBX.AVX2[bit 5] = 1.
+	 */
+	const uint32_t ecx_osxsave = 1 << 27;
+	const uint32_t ecx_avx = 1 << 28;
+	const uint32_t ebx_avx2 = 1 << 5;
+	const uint32_t xcr0_xmm_ymm = 3 << 1; /* OS saves XMM and YMM registers */
+
+	return raid_cpu_match_avx(ecx_osxsave | ecx_avx, ebx_avx2, xcr0_xmm_ymm);
+}
+
+static inline int raid_cpu_has_avx512bw(void)
+{
+	/*
+	 * Intel Architecture Instruction Set Extensions Programming Reference
+	 * 319433-022 October 2014
+	 *
+	 * 2.2 Detection of 512-bit Instruction Groups of Intel AVX-512 Family
+	 * 1) Detect CPUID.1:ECX.OSXSAVE[bit 27] = 1 (XGETBV enabled for
+	 * application use).
+	 * 2) Execute XGETBV and verify that XCR0[7:5] = `111b' (OPMASK state,
+	 * upper 256-bit of ZMM0-ZMM15 and ZMM16-ZMM31 state are enabled by OS)
+	 * and that XCR0[2:1] = `11b' (XMM state and YMM state are enabled
+	 * by OS).
+	 * 3) Verify both CPUID.0x7.0:EBX.AVX512F[bit 16] = 1 and
+	 * CPUID.0x7.0:EBX.AVX512BW[bit 30] = 1.
+	 *
+	 * Note that intentionally AVX and AVX2 are not checked, because
+	 * the documentation doesn't require that.
+	 */
+	const uint32_t ecx_osxsave = 1 << 27;
+	const uint32_t ebx_avx512f = 1 << 16;
+	const uint32_t ebx_avx512bw = 1 << 30;
+	const uint32_t xcr0_xmm_ymm = 3 << 1;
+	const uint32_t xcr0_zmm = 7 << 5;
+
+	return raid_cpu_match_avx(ecx_osxsave, ebx_avx512f | ebx_avx512bw,
+		xcr0_xmm_ymm | xcr0_zmm);
+}
+
+/**
+ * Check if it's an Intel Atom CPU.
+ */
+static inline int raid_cpu_is_atom(unsigned family, unsigned model)
+{
+	/* all the recognized models are of family 6 */
+	if (family != 6)
+		return 0;
+
+	/*
+	 * x86 Architecture CPUID
+	 * http://www.sandpile.org/x86/cpuid.htm
+	 *
+	 * The models are listed in hex, like in the referenced table.
+	 * Not listed there with a model number:
+	 * ?? Atom ?C (14 nm) ? MB L2 (DVN)
+	 */
+	switch (model) {
+	case 0x1C: /* (28) Atom (45 nm) with 512 KB on-die L2 */
+	case 0x26: /* (38) Atom (45 nm) with 512 KB on-die L2 */
+	case 0x36: /* (54) Atom (32 nm) with 512 KB on-die L2 */
+	case 0x27: /* (39) Atom (32 nm) with 512 KB on-die L2 */
+	case 0x35: /* (53) Atom (?? nm) with ??? KB on-die L2 */
+	case 0x4A: /* (74) Atom 2C (22 nm) 1 MB L2 + PowerVR (TGR) */
+	case 0x5A: /* (90) Atom 4C (22 nm) 2 MB L2 + PowerVR (ANN) */
+	case 0x37: /* (55) Atom 4C (22 nm) 2 MB L2 + Intel Gen7 (BYT) */
+	case 0x4C: /* (76) Atom 4C (14 nm) 2 MB L2 + Intel Gen8 (BSW) */
+	case 0x5D: /* (93) Atom 4C (28 nm TSMC) 1 MB L2 + Mali (SoFIA) */
+	case 0x4D: /* (77) Atom 8C (22 nm) 4 MB L2 (AVN) */
+		return 1;
+	default:
+		return 0;
+	}
+}
+
+/**
+ * Check if the processor has a slow MULT implementation.
+ * If yes, it's better to use a hash not based on multiplication.
+ */
+static inline int raid_cpu_has_slowmult(void)
+{
+	char vendor[CPU_VENDOR_MAX];
+	unsigned family;
+	unsigned model;
+
+	raid_cpu_info(vendor, &family, &model);
+
+	/*
+	 * In some cases Murmur3 based on MUL instruction,
+	 * is a LOT slower than Spooky2 based on SHIFTs.
+	 *
+	 * Intel Atom (Model 28)
+	 * murmur3:378 MB/s, spooky2:3413 MB/s (x86)
+	 *
+	 * Intel Atom (Model 77)
+	 * murmur3:1311 MB/s, spooky2:4056 MB/s (x64)
+	 */
+	return strcmp(vendor, "GenuineIntel") == 0
+		&& raid_cpu_is_atom(family, model);
+}
+
+/**
+ * Check if the processor has a slow extended set of SSE registers.
+ * If yes, it's better to limit the unroll to the first 8 registers.
+ */
+static inline int raid_cpu_has_slowextendedreg(void)
+{
+	char vendor[CPU_VENDOR_MAX];
+	unsigned family;
+	unsigned model;
+
+	/*
+	 * In some cases the PAR2 implementation using 16 SSE registers
+	 * is a LITTLE slower than the one using only the first 8 registers.
+	 * This doesn't happen for PARZ.
+	 */
+	raid_cpu_info(vendor, &family, &model);
+
+	/*
+	 * AMD Bulldozer
+	 * par2_sse2:4922 MB/s, par2_sse2e:4465 MB/s
+	 */
+	if (strcmp(vendor, "AuthenticAMD") == 0)
+		return family == 21;
+
+	/*
+	 * Intel Atom (Model 77)
+	 * par2_sse2:5686 MB/s, par2_sse2e:5250 MB/s
+	 * parz_sse2:3100 MB/s, parz_sse2e:3400 MB/s
+	 * par3_sse3:1921 MB/s, par3_sse3e:1813 MB/s
+	 * par4_sse3:1175 MB/s, par4_sse3e:1113 MB/s
+	 * par5_sse3:876 MB/s, par5_sse3e:675 MB/s
+	 * par6_sse3:705 MB/s, par6_sse3e:529 MB/s
+	 *
+	 * Intel Atom (Model 77) "Avoton C2750"
+	 * par2_sse2:5661 MB/s, par2_sse2e:5382 MB/s
+	 * parz_sse2:3110 MB/s, parz_sse2e:3450 MB/s
+	 * par3_sse3:1769 MB/s, par3_sse3e:1856 MB/s
+	 * par4_sse3:1221 MB/s, par4_sse3e:1141 MB/s
+	 * par5_sse3:910 MB/s, par5_sse3e:675 MB/s
+	 * par6_sse3:720 MB/s, par6_sse3e:534 MB/s
+	 */
+	if (strcmp(vendor, "GenuineIntel") == 0)
+		return raid_cpu_is_atom(family, model);
+
+	/* any other vendor */
+	return 0;
+}
+#endif
+
+#endif
+
--- a/raid/gf.h
+++ b/raid/gf.h
@ -0,0 +1,137 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_GF_H
+#define __RAID_GF_H
+
+/*
+ * Galois field operations.
+ *
+ * Basic range checks are implemented using BUG_ON().
+ */
+
+/*
+ * GF a*b, read from the gfmul table.
+ */
+static __always_inline uint8_t mul(uint8_t a, uint8_t b)
+{
+	/* the row of 'a' holds the products of 'a' by any other value */
+	const uint8_t *row = gfmul[a];
+
+	return row[b];
+}
+
+/*
+ * GF 1/v, read from the gfinv table.
+ * Not defined for v == 0, this condition is checked with BUG_ON().
+ */
+static __always_inline uint8_t inv(uint8_t v)
+{
+	BUG_ON(v == 0); /* division by zero */
+
+	return gfinv[v];
+}
+
+/*
+ * GF 2^v, read from the gfexp table.
+ * The exponent v must be in the range 0..254, checked with BUG_ON().
+ */
+static __always_inline uint8_t pow2(int v)
+{
+	BUG_ON(v < 0 || v > 254); /* invalid exponent */
+
+	return gfexp[v];
+}
+
+/*
+ * Gets the multiplication table for a specified value.
+ *
+ * The returned pointer is the row 'v' of the gfmul table,
+ * to be indexed with the other factor.
+ */
+static __always_inline const uint8_t *table(uint8_t v)
+{
+	return &gfmul[v][0];
+}
+
+/*
+ * Gets the generator matrix coefficient for parity 'p' and disk 'd'.
+ *
+ * It's a plain read of gfgen[p][d], without any range check.
+ */
+static __always_inline uint8_t A(int p, int d)
+{
+	return gfgen[p][d];
+}
+
+/*
+ * Dereference as uint8_t
+ *
+ * Like the next two macros, it takes the address of the lvalue 'p' and
+ * accesses the memory at that address with the named type, so the result
+ * can be both read and assigned.
+ */
+#define v_8(p) (*(uint8_t *)&(p))
+
+/*
+ * Dereference as uint32_t
+ *
+ * NOTE(review): the access goes through a cast pointer; the project is
+ * built with -fno-strict-aliasing, and the address is presumably expected
+ * to be suitably aligned by the callers - confirm before reusing it.
+ */
+#define v_32(p) (*(uint32_t *)&(p))
+
+/*
+ * Dereference as uint64_t
+ *
+ * NOTE(review): same considerations of v_32().
+ */
+#define v_64(p) (*(uint64_t *)&(p))
+
+/*
+ * Multiply each byte of a uint32 by 2 in the GF(2^8).
+ */
+static __always_inline uint32_t x2_32(uint32_t v)
+{
+	/* top bit of each byte, the one lost by the shift */
+	const uint32_t msb = v & 0x80808080U;
+
+	/* 0xff in each byte that had the top bit set, 0x00 in the others */
+	const uint32_t fill = (msb << 1) - (msb >> 7);
+
+	/* shift each byte on its own, and reduce the overflowed ones */
+	return ((v << 1) & 0xfefefefeU) ^ (fill & 0x1d1d1d1dU);
+}
+
+/*
+ * Multiply each byte of a uint64 by 2 in the GF(2^8).
+ */
+static __always_inline uint64_t x2_64(uint64_t v)
+{
+	/* top bit of each byte, the one lost by the shift */
+	const uint64_t msb = v & 0x8080808080808080ULL;
+
+	/* 0xff in each byte that had the top bit set, 0x00 in the others */
+	const uint64_t fill = (msb << 1) - (msb >> 7);
+
+	/* shift each byte on its own, and reduce the overflowed ones */
+	return ((v << 1) & 0xfefefefefefefefeULL) ^ (fill & 0x1d1d1d1d1d1d1d1dULL);
+}
+
+/*
+ * Divide each byte of a uint32 by 2 in the GF(2^8).
+ */
+static __always_inline uint32_t d2_32(uint32_t v)
+{
+	/* low bit of each byte, the one lost by the shift */
+	const uint32_t lsb = v & 0x01010101U;
+
+	/* 0xff in each byte that had the low bit set, 0x00 in the others */
+	const uint32_t fill = (lsb << 8) - lsb;
+
+	/* shift each byte on its own, and fix the ones that lost a bit */
+	return ((v >> 1) & 0x7f7f7f7fU) ^ (fill & 0x8e8e8e8eU);
+}
+
+/*
+ * Divide each byte of a uint64 by 2 in the GF(2^8).
+ */
+static __always_inline uint64_t d2_64(uint64_t v)
+{
+	/* low bit of each byte, the one lost by the shift */
+	const uint64_t lsb = v & 0x0101010101010101ULL;
+
+	/* 0xff in each byte that had the low bit set, 0x00 in the others */
+	const uint64_t fill = (lsb << 8) - lsb;
+
+	/* shift each byte on its own, and fix the ones that lost a bit */
+	return ((v >> 1) & 0x7f7f7f7f7f7f7f7fULL) ^ (fill & 0x8e8e8e8e8e8e8e8eULL);
+}
+
+#endif
+
--- a/raid/helper.c
+++ b/raid/helper.c
@ -0,0 +1,94 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+/*
+ * Compare-exchange step of the sorting networks.
+ *
+ * After it v[a] <= v[b]. It operates on the vector 'v' of the
+ * function using it.
+ */
+#define RAID_SWAP(a, b) \
+	do { \
+		if (v[(a)] > v[(b)]) { \
+			int t = v[(a)]; \
+			v[(a)] = v[(b)]; \
+			v[(b)] = t; \
+		} \
+	} while (0)
+
+/*
+ * Sorts in place, in ascending order, a small vector of integers.
+ *
+ * Vectors from 2 to 6 elements are sorted with a fixed sorting network.
+ * Any other size uses a plain insertion sort, to never return a vector
+ * not sorted.
+ *
+ * @n Number of integers in the vector.
+ * @v Vector of integers.
+ */
+void raid_sort(int n, int *v)
+{
+	int i, j;
+
+	/* sorting networks generated with Batcher's Merge-Exchange */
+	switch (n) {
+	case 2:
+		RAID_SWAP(0, 1);
+		break;
+	case 3:
+		RAID_SWAP(0, 2);
+		RAID_SWAP(0, 1);
+		RAID_SWAP(1, 2);
+		break;
+	case 4:
+		RAID_SWAP(0, 2);
+		RAID_SWAP(1, 3);
+		RAID_SWAP(0, 1);
+		RAID_SWAP(2, 3);
+		RAID_SWAP(1, 2);
+		break;
+	case 5:
+		RAID_SWAP(0, 4);
+		RAID_SWAP(0, 2);
+		RAID_SWAP(1, 3);
+		RAID_SWAP(2, 4);
+		RAID_SWAP(0, 1);
+		RAID_SWAP(2, 3);
+		RAID_SWAP(1, 4);
+		RAID_SWAP(1, 2);
+		RAID_SWAP(3, 4);
+		break;
+	case 6:
+		RAID_SWAP(0, 4);
+		RAID_SWAP(1, 5);
+		RAID_SWAP(0, 2);
+		RAID_SWAP(1, 3);
+		RAID_SWAP(2, 4);
+		RAID_SWAP(3, 5);
+		RAID_SWAP(0, 1);
+		RAID_SWAP(2, 3);
+		RAID_SWAP(4, 5);
+		RAID_SWAP(1, 4);
+		RAID_SWAP(1, 2);
+		RAID_SWAP(3, 4);
+		break;
+	default:
+		/*
+		 * No network for this size, use an insertion sort.
+		 * With less than two elements the loop never runs.
+		 */
+		for (i = 1; i < n; ++i) {
+			int t = v[i];
+
+			for (j = i; j > 0 && v[j - 1] > t; --j)
+				v[j] = v[j - 1];
+			v[j] = t;
+		}
+		break;
+	}
+}
+
+void raid_insert(int n, int *v, int i)
+{
+	/* candidate position of the new element, starting from the end */
+	int pos = n;
+
+	/*
+	 * A linear scan from the end is used, and not a binary search,
+	 * because the vectors are very small, and the most common case
+	 * is of elements inserted already in order, that ends at once.
+	 */
+
+	/* move up by one all the elements greater than the new one */
+	while (pos > 0 && v[pos - 1] > i) {
+		v[pos] = v[pos - 1];
+		--pos;
+	}
+
+	/* store the new element in the hole left */
+	v[pos] = i;
+}
+
--- a/raid/helper.h
+++ b/raid/helper.h
@ -0,0 +1,43 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_HELPER_H
+#define __RAID_HELPER_H
+
+/**
+ * Inserts an integer in a sorted vector.
+ *
+ * This function can be used to insert indexes in order, ready to be used for
+ * calling raid_rec().
+ *
+ * The vector is kept sorted in ascending order. A value equal to others
+ * already present is placed after them.
+ *
+ * @n Number of integers currently in the vector.
+ * @v Vector of integers already sorted.
+ *   It must have extra space for the new element at the end.
+ * @i Value to insert.
+ */
+void raid_insert(int n, int *v, int i);
+
+/**
+ * Sorts a small vector of integers.
+ *
+ * If you have indexes not in order, you can use this function to sort them
+ * before calling raid_rec().
+ *
+ * The vector is sorted in place, in ascending order.
+ *
+ * @n Number of integers. No more than RAID_PARITY_MAX.
+ * @v Vector of integers.
+ */
+void raid_sort(int n, int *v);
+
+#endif
+
--- a/raid/int.c
+++ b/raid/int.c
@ -0,0 +1,556 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * GEN1 (RAID5 with xor) 32bit C implementation
+ */
+void raid_gen1_int32(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const parity = v[nd]; /* the parity follows the data blocks */
+	const int last = nd - 1;
+	size_t off;
+	int disk;
+
+	/* two words of 32 bits, 8 bytes in total, for each iteration */
+	for (off = 0; off < size; off += 8) {
+		/* start from the last data block */
+		uint32_t lo = v_32(v[last][off]);
+		uint32_t hi = v_32(v[last][off + 4]);
+
+		/* and xor all the other ones, down to the first */
+		for (disk = last - 1; disk >= 0; --disk) {
+			lo ^= v_32(v[disk][off]);
+			hi ^= v_32(v[disk][off + 4]);
+		}
+
+		v_32(parity[off]) = lo;
+		v_32(parity[off + 4]) = hi;
+	}
+}
+
+/*
+ * GEN1 (RAID5 with xor) 64bit C implementation
+ */
+void raid_gen1_int64(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const parity = v[nd]; /* the parity follows the data blocks */
+	const int last = nd - 1;
+	size_t off;
+	int disk;
+
+	/* two words of 64 bits, 16 bytes in total, for each iteration */
+	for (off = 0; off < size; off += 16) {
+		/* start from the last data block */
+		uint64_t lo = v_64(v[last][off]);
+		uint64_t hi = v_64(v[last][off + 8]);
+
+		/* and xor all the other ones, down to the first */
+		for (disk = last - 1; disk >= 0; --disk) {
+			lo ^= v_64(v[disk][off]);
+			hi ^= v_64(v[disk][off + 8]);
+		}
+
+		v_64(parity[off]) = lo;
+		v_64(parity[off + 8]) = hi;
+	}
+}
+
+/*
+ * GEN2 (RAID6 with powers of 2) 32bit C implementation
+ */
+void raid_gen2_int32(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const p = v[nd];
+	uint8_t *const q = v[nd + 1];
+	const int last = nd - 1;
+	size_t off;
+	int disk;
+
+	/* two words of 32 bits, 8 bytes in total, for each iteration */
+	for (off = 0; off < size; off += 8) {
+		/* both the parities start from the last data block */
+		uint32_t p0 = v_32(v[last][off]);
+		uint32_t p1 = v_32(v[last][off + 4]);
+		uint32_t q0 = p0;
+		uint32_t q1 = p1;
+
+		for (disk = last - 1; disk >= 0; --disk) {
+			const uint32_t d0 = v_32(v[disk][off]);
+			const uint32_t d1 = v_32(v[disk][off + 4]);
+
+			/* P is the plain xor */
+			p0 ^= d0;
+			p1 ^= d1;
+
+			/* Q is multiplied by 2 in GF before adding the block */
+			q0 = x2_32(q0) ^ d0;
+			q1 = x2_32(q1) ^ d1;
+		}
+
+		v_32(p[off]) = p0;
+		v_32(p[off + 4]) = p1;
+		v_32(q[off]) = q0;
+		v_32(q[off + 4]) = q1;
+	}
+}
+
+/*
+ * GEN2 (RAID6 with powers of 2) 64bit C implementation
+ */
+void raid_gen2_int64(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const p = v[nd];
+	uint8_t *const q = v[nd + 1];
+	const int last = nd - 1;
+	size_t off;
+	int disk;
+
+	/* two words of 64 bits, 16 bytes in total, for each iteration */
+	for (off = 0; off < size; off += 16) {
+		/* both the parities start from the last data block */
+		uint64_t p0 = v_64(v[last][off]);
+		uint64_t p1 = v_64(v[last][off + 8]);
+		uint64_t q0 = p0;
+		uint64_t q1 = p1;
+
+		for (disk = last - 1; disk >= 0; --disk) {
+			const uint64_t d0 = v_64(v[disk][off]);
+			const uint64_t d1 = v_64(v[disk][off + 8]);
+
+			/* P is the plain xor */
+			p0 ^= d0;
+			p1 ^= d1;
+
+			/* Q is multiplied by 2 in GF before adding the block */
+			q0 = x2_64(q0) ^ d0;
+			q1 = x2_64(q1) ^ d1;
+		}
+
+		v_64(p[off]) = p0;
+		v_64(p[off + 8]) = p1;
+		v_64(q[off]) = q0;
+		v_64(q[off + 8]) = q1;
+	}
+}
+
+/*
+ * GEN3 (triple parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of a generic multiplication table, likely resulting
+ * in multiple cache misses, a precomputed table could be used.
+ * But this is only a kind of reference function, and we are not really
+ * interested in speed.
+ */
+void raid_gen3_int8(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const p = v[nd];
+	uint8_t *const q = v[nd + 1];
+	uint8_t *const r = v[nd + 2];
+	size_t pos;
+	int disk;
+
+	/* one byte for each iteration */
+	for (pos = 0; pos < size; ++pos) {
+		uint8_t acc_p = 0, acc_q = 0, acc_r = 0;
+		uint8_t byte;
+
+		/* from the last disk down to 1, using the gfgen coefficients */
+		for (disk = nd - 1; disk > 0; --disk) {
+			byte = v_8(v[disk][pos]);
+
+			acc_p ^= byte;
+			acc_q ^= gfmul[byte][gfgen[1][disk]];
+			acc_r ^= gfmul[byte][gfgen[2][disk]];
+		}
+
+		/* first disk with all coefficients at 1 */
+		byte = v_8(v[0][pos]);
+
+		v_8(p[pos]) = acc_p ^ byte;
+		v_8(q[pos]) = acc_q ^ byte;
+		v_8(r[pos]) = acc_r ^ byte;
+	}
+}
+
+/*
+ * GEN4 (quad parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of a generic multiplication table, likely resulting
+ * in multiple cache misses, a precomputed table could be used.
+ * But this is only a kind of reference function, and we are not really
+ * interested in speed.
+ */
+void raid_gen4_int8(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const p = v[nd];
+	uint8_t *const q = v[nd + 1];
+	uint8_t *const r = v[nd + 2];
+	uint8_t *const s = v[nd + 3];
+	size_t pos;
+	int disk;
+
+	/* one byte for each iteration */
+	for (pos = 0; pos < size; ++pos) {
+		uint8_t acc_p = 0, acc_q = 0, acc_r = 0, acc_s = 0;
+		uint8_t byte;
+
+		/* from the last disk down to 1, using the gfgen coefficients */
+		for (disk = nd - 1; disk > 0; --disk) {
+			byte = v_8(v[disk][pos]);
+
+			acc_p ^= byte;
+			acc_q ^= gfmul[byte][gfgen[1][disk]];
+			acc_r ^= gfmul[byte][gfgen[2][disk]];
+			acc_s ^= gfmul[byte][gfgen[3][disk]];
+		}
+
+		/* first disk with all coefficients at 1 */
+		byte = v_8(v[0][pos]);
+
+		v_8(p[pos]) = acc_p ^ byte;
+		v_8(q[pos]) = acc_q ^ byte;
+		v_8(r[pos]) = acc_r ^ byte;
+		v_8(s[pos]) = acc_s ^ byte;
+	}
+}
+
+/*
+ * GEN5 (penta parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of a generic multiplication table, likely resulting
+ * in multiple cache misses, a precomputed table could be used.
+ * But this is only a kind of reference function, and we are not really
+ * interested in speed.
+ */
+void raid_gen5_int8(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const p = v[nd];
+	uint8_t *const q = v[nd + 1];
+	uint8_t *const r = v[nd + 2];
+	uint8_t *const s = v[nd + 3];
+	uint8_t *const t = v[nd + 4];
+	size_t pos;
+	int disk;
+
+	/* one byte for each iteration */
+	for (pos = 0; pos < size; ++pos) {
+		uint8_t acc_p = 0, acc_q = 0, acc_r = 0, acc_s = 0, acc_t = 0;
+		uint8_t byte;
+
+		/* from the last disk down to 1, using the gfgen coefficients */
+		for (disk = nd - 1; disk > 0; --disk) {
+			byte = v_8(v[disk][pos]);
+
+			acc_p ^= byte;
+			acc_q ^= gfmul[byte][gfgen[1][disk]];
+			acc_r ^= gfmul[byte][gfgen[2][disk]];
+			acc_s ^= gfmul[byte][gfgen[3][disk]];
+			acc_t ^= gfmul[byte][gfgen[4][disk]];
+		}
+
+		/* first disk with all coefficients at 1 */
+		byte = v_8(v[0][pos]);
+
+		v_8(p[pos]) = acc_p ^ byte;
+		v_8(q[pos]) = acc_q ^ byte;
+		v_8(r[pos]) = acc_r ^ byte;
+		v_8(s[pos]) = acc_s ^ byte;
+		v_8(t[pos]) = acc_t ^ byte;
+	}
+}
+
+/*
+ * GEN6 (hexa parity with Cauchy matrix) 8bit C implementation
+ *
+ * Note that instead of a generic multiplication table, likely resulting
+ * in multiple cache misses, a precomputed table could be used.
+ * But this is only a kind of reference function, and we are not really
+ * interested in speed.
+ */
+void raid_gen6_int8(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *const p = v[nd];
+	uint8_t *const q = v[nd + 1];
+	uint8_t *const r = v[nd + 2];
+	uint8_t *const s = v[nd + 3];
+	uint8_t *const t = v[nd + 4];
+	uint8_t *const u = v[nd + 5];
+	size_t pos;
+	int disk;
+
+	/* one byte for each iteration */
+	for (pos = 0; pos < size; ++pos) {
+		uint8_t acc_p = 0, acc_q = 0, acc_r = 0;
+		uint8_t acc_s = 0, acc_t = 0, acc_u = 0;
+		uint8_t byte;
+
+		/* from the last disk down to 1, using the gfgen coefficients */
+		for (disk = nd - 1; disk > 0; --disk) {
+			byte = v_8(v[disk][pos]);
+
+			acc_p ^= byte;
+			acc_q ^= gfmul[byte][gfgen[1][disk]];
+			acc_r ^= gfmul[byte][gfgen[2][disk]];
+			acc_s ^= gfmul[byte][gfgen[3][disk]];
+			acc_t ^= gfmul[byte][gfgen[4][disk]];
+			acc_u ^= gfmul[byte][gfgen[5][disk]];
+		}
+
+		/* first disk with all coefficients at 1 */
+		byte = v_8(v[0][pos]);
+
+		v_8(p[pos]) = acc_p ^ byte;
+		v_8(q[pos]) = acc_q ^ byte;
+		v_8(r[pos]) = acc_r ^ byte;
+		v_8(s[pos]) = acc_s ^ byte;
+		v_8(t[pos]) = acc_t ^ byte;
+		v_8(u[pos]) = acc_u ^ byte;
+	}
+}
+
+/*
+ * Recover failure of one data block at index id[0] using parity at index
+ * ip[0] for any RAID level.
+ *
+ * Starting from the equation:
+ *
+ * Pd = A[ip[0],id[0]] * Dx
+ *
+ * and solving we get:
+ *
+ * Dx = A[ip[0],id[0]]^-1 * Pd
+ */
+void raid_rec1_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	const uint8_t *lut;
+	uint8_t *parity;
+	uint8_t *data;
+	size_t pos;
+
+	(void)nr; /* unused, it's always 1 */
+
+	/* with the first parity, hand over to the faster RAID5 function */
+	if (ip[0] == 0) {
+		raid_rec1of1(id, nd, size, vv);
+		return;
+	}
+
+	/*
+	 * The system has a single coefficient: get the multiplication
+	 * table of its inverse, to solve the equation with a lookup.
+	 */
+	lut = table(inv(A(ip[0], id[0])));
+
+	/* compute delta parity */
+	raid_delta_gen(1, id, ip, nd, size, vv);
+
+	parity = v[nd + ip[0]];
+	data = v[id[0]];
+
+	/* reconstruct each byte from the delta of the two buffers */
+	for (pos = 0; pos < size; ++pos)
+		data[pos] = lut[parity[pos] ^ data[pos]];
+}
+
+/*
+ * Recover failure of two data blocks at indexes id[0],id[1] using parity at
+ * indexes ip[0],ip[1] for any RAID level.
+ *
+ * Starting from the equations:
+ *
+ * Pd = A[ip[0],id[0]] * Dx + A[ip[0],id[1]] * Dy
+ * Qd = A[ip[1],id[0]] * Dx + A[ip[1],id[1]] * Dy
+ *
+ * we solve inverting the coefficients matrix.
+ */
+void raid_rec2_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+	/*
+	 * Size of the system to solve.
+	 *
+	 * It's an enumeration constant, and not a 'const int', because
+	 * only the former is a constant expression in C. This makes the
+	 * arrays below true fixed size arrays and not variable length ones.
+	 */
+	enum { N = 2 };
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *p;
+	uint8_t *pa;
+	uint8_t *q;
+	uint8_t *qa;
+	const uint8_t *T[N][N];
+	uint8_t G[N * N];
+	uint8_t V[N * N];
+	size_t i;
+	int j, k;
+
+	(void)nr; /* unused, it's always 2 */
+
+	/* if it's RAID6 recovering with P and Q uses the faster function */
+	if (ip[0] == 0 && ip[1] == 1) {
+		raid_rec2of2_int8(id, ip, nd, size, vv);
+		return;
+	}
+
+	/* setup the coefficients matrix */
+	for (j = 0; j < N; ++j)
+		for (k = 0; k < N; ++k)
+			G[j * N + k] = A(ip[j], id[k]);
+
+	/* invert it to solve the system of linear equations */
+	raid_invert(G, V, N);
+
+	/* get multiplication tables */
+	for (j = 0; j < N; ++j)
+		for (k = 0; k < N; ++k)
+			T[j][k] = table(V[j * N + k]);
+
+	/* compute delta parity */
+	raid_delta_gen(2, id, ip, nd, size, vv);
+
+	/* the two parities used, and the two data blocks to recover */
+	p = v[nd + ip[0]];
+	q = v[nd + ip[1]];
+	pa = v[id[0]];
+	qa = v[id[1]];
+
+	for (i = 0; i < size; ++i) {
+		/* delta */
+		uint8_t Pd = p[i] ^ pa[i];
+		uint8_t Qd = q[i] ^ qa[i];
+
+		/* reconstruct */
+		pa[i] = T[0][0][Pd] ^ T[0][1][Qd];
+		qa[i] = T[1][0][Pd] ^ T[1][1][Qd];
+	}
+}
+
+/*
+ * Recover failure of N data blocks at indexes id[N] using parity at indexes
+ * ip[N] for any RAID level.
+ *
+ * Starting from the N equations, with 0<=i<N :
+ *
+ * PD[i] = sum(A[ip[i],id[j]] * D[j]) 0<=j<N
+ *
+ * we solve inverting the coefficients matrix.
+ *
+ * Note that referring to the previous equations you have:
+ * PD[0] = Pd, PD[1] = Qd, PD[2] = Rd, ...
+ * D[0] = Dx, D[1] = Dy, D[2] = Dz, ...
+ */
+void raid_recX_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	uint8_t *p[RAID_PARITY_MAX];
+	uint8_t *pa[RAID_PARITY_MAX];
+	const uint8_t *T[RAID_PARITY_MAX][RAID_PARITY_MAX];
+	uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
+	uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
+	size_t i;
+	int j, k;
+
+	/* all the local arrays are sized for RAID_PARITY_MAX failures */
+	BUG_ON(nr > RAID_PARITY_MAX);
+
+	/* setup the coefficients matrix */
+	for (j = 0; j < nr; ++j)
+		for (k = 0; k < nr; ++k)
+			G[j * nr + k] = A(ip[j], id[k]);
+
+	/* invert it to solve the system of linear equations */
+	raid_invert(G, V, nr);
+
+	/* get multiplication tables */
+	for (j = 0; j < nr; ++j)
+		for (k = 0; k < nr; ++k)
+			T[j][k] = table(V[j * nr + k]);
+
+	/* compute delta parity */
+	raid_delta_gen(nr, id, ip, nd, size, vv);
+
+	/* the parities used, and the data blocks to recover */
+	for (j = 0; j < nr; ++j) {
+		p[j] = v[nd + ip[j]];
+		pa[j] = v[id[j]];
+	}
+
+	for (i = 0; i < size; ++i) {
+		uint8_t PD[RAID_PARITY_MAX];
+
+		/* delta */
+		for (j = 0; j < nr; ++j)
+			PD[j] = p[j][i] ^ pa[j][i];
+
+		/* reconstruct */
+		for (j = 0; j < nr; ++j) {
+			uint8_t b = 0;
+
+			/* row j of the inverted matrix by the delta vector */
+			for (k = 0; k < nr; ++k)
+				b ^= T[j][k][PD[k]];
+			pa[j][i] = b;
+		}
+	}
+}
+
--- a/raid/internal.h
+++ b/raid/internal.h
@ -0,0 +1,274 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_INTERNAL_H
+#define __RAID_INTERNAL_H
+
+/*
+ * Supported instruction sets.
+ *
+ * It may happen that the assembler is too old to support
+ * all instructions, even if the architecture supports them.
+ * These defines allow to exclude from the build the not supported ones.
+ *
+ * If in your project you use a predefined assembler, you can define them
+ * using fixed values, instead of using the HAVE_* defines.
+ */
+#if HAVE_CONFIG_H
+
+/* Includes the project configuration for HAVE_* defines */
+#include "config.h"
+
+/* If the compiler supports assembly */
+#if HAVE_ASSEMBLY
+/* Autodetect from the compiler */
+#if defined(__i386__)
+#define CONFIG_X86 1
+#define CONFIG_X86_32 1
+#endif
+#if defined(__x86_64__)
+#define CONFIG_X86 1
+#define CONFIG_X86_64 1
+#endif
+#endif /* HAVE_ASSEMBLY */
+
+/* Enables SSE2, SSSE3, AVX2 only if the assembler supports it */
+/* Note that here they depend only on HAVE_*, and not on CONFIG_X86 */
+#if HAVE_SSE2
+#define CONFIG_SSE2 1
+#endif
+#if HAVE_SSSE3
+#define CONFIG_SSSE3 1
+#endif
+#if HAVE_AVX2
+#define CONFIG_AVX2 1
+#endif
+
+#else /* if HAVE_CONFIG_H is not defined */
+
+/* Assume that assembly is always supported */
+#if defined(__i386__)
+#define CONFIG_X86 1
+#define CONFIG_X86_32 1
+#endif
+
+#if defined(__x86_64__)
+#define CONFIG_X86 1
+#define CONFIG_X86_64 1
+#endif
+
+/* Assumes that the assembler supports everything */
+/* but only when compiling for a x86 architecture */
+#ifdef CONFIG_X86
+#define CONFIG_SSE2 1
+#define CONFIG_SSSE3 1
+#define CONFIG_AVX2 1
+#endif
+#endif /* HAVE_CONFIG_H */
+
+/*
+ * Includes anything required for compatibility.
+ */
+#include <assert.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+
+/*
+ * Inverse assert.
+ *
+ * It fails when the condition is true. Being based on assert(),
+ * the check is compiled out when NDEBUG is defined.
+ */
+#define BUG_ON(a) assert(!(a))
+
+/*
+ * Forced inline.
+ *
+ * Defined only if not already provided by a previously included header.
+ */
+#ifndef __always_inline
+#define __always_inline inline __attribute__((always_inline))
+#endif
+
+/*
+ * Forced alignment.
+ *
+ * Defined only if not already provided by a previously included header.
+ */
+#ifndef __aligned
+#define __aligned(a) __attribute__((aligned(a)))
+#endif
+
+/*
+ * Align a pointer at the specified size.
+ */
+static __always_inline void *__align_ptr(void *ptr, uintptr_t size)
+{
+	/* low bits to clear; the mask works only if size is a power of 2 */
+	const uintptr_t mask = size - 1U;
+
+	/* round up to the next multiple of size, if not already one */
+	return (void *)(((uintptr_t)ptr + mask) & ~mask);
+}
+
+/*
+ * Includes the main interface headers.
+ */
+#include "raid.h"
+#include "helper.h"
+
+/*
+ * Internal functions.
+ *
+ * These are intended to provide access for testing.
+ */
+/* generic helpers */
+int raid_selftest(void);
+void raid_gen_ref(int nd, int np, size_t size, void **vv);
+void raid_invert(uint8_t *M, uint8_t *V, int n);
+void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v);
+void raid_rec1of1(int *id, int nd, size_t size, void **v);
+void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv);
+/*
+ * Parity generation, one family for each parity level.
+ * The _int* ones are the portable C implementations; the other suffixes
+ * presumably name the instruction set required - confirm in their sources.
+ */
+void raid_gen1_int32(int nd, size_t size, void **vv);
+void raid_gen1_int64(int nd, size_t size, void **vv);
+void raid_gen1_sse2(int nd, size_t size, void **vv);
+void raid_gen1_avx2(int nd, size_t size, void **vv);
+void raid_gen2_int32(int nd, size_t size, void **vv);
+void raid_gen2_int64(int nd, size_t size, void **vv);
+void raid_gen2_sse2(int nd, size_t size, void **vv);
+void raid_gen2_avx2(int nd, size_t size, void **vv);
+void raid_gen2_sse2ext(int nd, size_t size, void **vv);
+void raid_genz_int32(int nd, size_t size, void **vv);
+void raid_genz_int64(int nd, size_t size, void **vv);
+void raid_genz_sse2(int nd, size_t size, void **vv);
+void raid_genz_sse2ext(int nd, size_t size, void **vv);
+void raid_genz_avx2ext(int nd, size_t size, void **vv);
+void raid_gen3_int8(int nd, size_t size, void **vv);
+void raid_gen3_ssse3(int nd, size_t size, void **vv);
+void raid_gen3_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen3_avx2ext(int nd, size_t size, void **vv);
+void raid_gen4_int8(int nd, size_t size, void **vv);
+void raid_gen4_ssse3(int nd, size_t size, void **vv);
+void raid_gen4_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen4_avx2ext(int nd, size_t size, void **vv);
+void raid_gen5_int8(int nd, size_t size, void **vv);
+void raid_gen5_ssse3(int nd, size_t size, void **vv);
+void raid_gen5_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen5_avx2ext(int nd, size_t size, void **vv);
+void raid_gen6_int8(int nd, size_t size, void **vv);
+void raid_gen6_ssse3(int nd, size_t size, void **vv);
+void raid_gen6_ssse3ext(int nd, size_t size, void **vv);
+void raid_gen6_avx2ext(int nd, size_t size, void **vv);
+/*
+ * Recovering of data blocks: rec1 for one failure, rec2 for two,
+ * recX for any number of them.
+ */
+void raid_rec1_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec2_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_recX_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec1_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec2_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_recX_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec1_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_rec2_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+void raid_recX_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
+
+/*
+ * Internal naming.
+ *
+ * These are intended to provide access for testing.
+ */
+const char *raid_gen1_tag(void);
+const char *raid_gen2_tag(void);
+const char *raid_genz_tag(void);
+const char *raid_gen3_tag(void);
+const char *raid_gen4_tag(void);
+const char *raid_gen5_tag(void);
+const char *raid_gen6_tag(void);
+const char *raid_rec1_tag(void);
+const char *raid_rec2_tag(void);
+const char *raid_recX_tag(void);
+
+/*
+ * Internal forwarders.
+ */
+extern void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
+extern void (*raid_genz_ptr)(int nd, size_t size, void **vv);
+extern void (*raid_gen_ptr[RAID_PARITY_MAX])(
+	int nd, size_t size, void **vv);
+extern void (*raid_rec_ptr[RAID_PARITY_MAX])(
+	int nr, int *id, int *ip, int nd, size_t size, void **vv);
+
+/*
+ * Tables.
+ */
+extern const uint8_t raid_gfmul[256][256] __aligned(256);
+extern const uint8_t raid_gfexp[256] __aligned(256);
+extern const uint8_t raid_gfinv[256] __aligned(256);
+extern const uint8_t raid_gfvandermonde[3][256] __aligned(256);
+extern const uint8_t raid_gfcauchy[6][256] __aligned(256);
+extern const uint8_t raid_gfcauchypshufb[251][4][2][16] __aligned(256);
+extern const uint8_t raid_gfmulpshufb[256][2][16] __aligned(256);
+extern const uint8_t (*raid_gfgen)[256];
+#define gfmul raid_gfmul
+#define gfexp raid_gfexp
+#define gfinv raid_gfinv
+#define gfvandermonde raid_gfvandermonde
+#define gfcauchy raid_gfcauchy
+#define gfgenpshufb raid_gfcauchypshufb
+#define gfmulpshufb raid_gfmulpshufb
+#define gfgen raid_gfgen
+
+/*
+ * Assembler blocks.
+ */
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+static __always_inline void raid_sse_begin(void)
+{	/* intentionally empty: no setup is performed before the SSE asm code */
+}
+
+static __always_inline void raid_sse_end(void)
+{
+	/* Epilogue for the SSE asm code. SSE and AVX code uses non-temporal */
+	/* writes, like MOVNTDQ, that follow a weak memory model. To ensure */
+	/* that other processors correctly see the data written, we issue */
+	/* a store-store memory barrier at the end of the asm code */
+	asm volatile ("sfence" : : : "memory");
+
+	/* Clobber the registers used in the asm code. */
+	/* This is required because in the Windows ABI */
+	/* registers xmm6-xmm15 must be preserved by the callee. */
+	/* This clobber list forces the compiler to save any */
+	/* register that needs to be saved. */
+	/* We check for __SSE2__ because we require that the */
+	/* compiler supports SSE2 registers in the clobber list */
+#ifdef __SSE2__
+	asm volatile ("" : : : "%xmm0", "%xmm1", "%xmm2", "%xmm3");
+	asm volatile ("" : : : "%xmm4", "%xmm5", "%xmm6", "%xmm7");
+#ifdef CONFIG_X86_64 /* xmm8-xmm15 are listed only on 64-bit builds */
+	asm volatile ("" : : : "%xmm8", "%xmm9", "%xmm10", "%xmm11");
+	asm volatile ("" : : : "%xmm12", "%xmm13", "%xmm14", "%xmm15");
+#endif /* CONFIG_X86_64 */
+#endif /* __SSE2__ */
+}
+#endif
+
+#ifdef CONFIG_AVX2
+static __always_inline void raid_avx_begin(void)
+{
+	raid_sse_begin(); /* the AVX prologue simply forwards to the SSE one */
+}
+
+static __always_inline void raid_avx_end(void)
+{
+	raid_sse_end(); /* run the SSE epilogue first (sfence plus xmm clobbers) */
+
+	/* reset the upper part of the ymm registers */
+	/* to avoid the 70 clocks penalty on the next */
+	/* xmm register use */
+	asm volatile ("vzeroupper" : : : "memory");
+}
+#endif
+#endif /* CONFIG_X86 */
+
+#endif
+
--- a/raid/intz.c
+++ b/raid/intz.c
@ -0,0 +1,119 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * GENz (triple parity with powers of 2^-1) 32bit C implementation
+ */
+void raid_genz_int32(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t**)vv;
+	const int last = nd - 1;
+	uint8_t *p = v[nd];
+	uint8_t *q = v[nd + 1];
+	uint8_t *r = v[nd + 2];
+	size_t pos;
+
+	/* every pass handles 8 bytes as two independent 32-bit words */
+	for (pos = 0; pos < size; pos += 8) {
+		uint32_t pa, qa, ra, da;
+		uint32_t pb, qb, rb, db;
+		int disk = last;
+
+		/* all three parities start from the last data block */
+		pa = qa = ra = v_32(v[last][pos]);
+		pb = qb = rb = v_32(v[last][pos + 4]);
+
+		/* fold in the remaining data blocks, walking down to block 0 */
+		while (disk-- > 0) {
+			da = v_32(v[disk][pos]);
+			db = v_32(v[disk][pos + 4]);
+
+			/* P: plain XOR of the data */
+			pa ^= da;
+			pb ^= db;
+
+			/* Q: x2_32() on the running value, then add the data */
+			qa = x2_32(qa);
+			qa ^= da;
+			qb = x2_32(qb);
+			qb ^= db;
+
+			/* R: d2_32() on the running value, then add the data */
+			ra = d2_32(ra);
+			ra ^= da;
+			rb = d2_32(rb);
+			rb ^= db;
+		}
+
+		v_32(p[pos]) = pa;
+		v_32(p[pos + 4]) = pb;
+		v_32(q[pos]) = qa;
+		v_32(q[pos + 4]) = qb;
+		v_32(r[pos]) = ra;
+		v_32(r[pos + 4]) = rb;
+	}
+}
+
+/*
+ * GENz (triple parity with powers of 2^-1) 64bit C implementation
+ */
+void raid_genz_int64(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t**)vv;
+	const int last = nd - 1;
+	uint8_t *p = v[nd];
+	uint8_t *q = v[nd + 1];
+	uint8_t *r = v[nd + 2];
+	size_t pos;
+
+	/* every pass handles 16 bytes as two independent 64-bit words */
+	for (pos = 0; pos < size; pos += 16) {
+		uint64_t pa, qa, ra, da;
+		uint64_t pb, qb, rb, db;
+		int disk = last;
+
+		/* all three parities start from the last data block */
+		pa = qa = ra = v_64(v[last][pos]);
+		pb = qb = rb = v_64(v[last][pos + 8]);
+
+		/* fold in the remaining data blocks, walking down to block 0 */
+		while (disk-- > 0) {
+			da = v_64(v[disk][pos]);
+			db = v_64(v[disk][pos + 8]);
+
+			/* P: plain XOR of the data */
+			pa ^= da;
+			pb ^= db;
+
+			/* Q: x2_64() on the running value, then add the data */
+			qa = x2_64(qa);
+			qa ^= da;
+			qb = x2_64(qb);
+			qb ^= db;
+
+			/* R: d2_64() on the running value, then add the data */
+			ra = d2_64(ra);
+			ra ^= da;
+			rb = d2_64(rb);
+			rb ^= db;
+		}
+
+		v_64(p[pos]) = pa;
+		v_64(p[pos + 8]) = pb;
+		v_64(q[pos]) = qa;
+		v_64(q[pos + 8]) = qb;
+		v_64(r[pos]) = ra;
+		v_64(r[pos + 8]) = rb;
+	}
+}
+
--- a/raid/memory.c
+++ b/raid/memory.c
@ -0,0 +1,154 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "memory.h"
+
+void *raid_malloc_align(size_t size, size_t align_size, void **freeptr)
+{
+	unsigned char *ptr;
+	uintptr_t offset;
+
+	/*
+	 * Returns a pointer to @size usable bytes whose address is a
+	 * multiple of @align_size, or NULL on failure.
+	 * The pointer to pass to free() is stored in *@freeptr; it is
+	 * set to NULL on failure so that the caller can always free() it.
+	 */
+	*freeptr = NULL;
+
+	/* a zero alignment would divide by zero below, and an oversized */
+	/* request would wrap the padded length and under-allocate */
+	if (align_size == 0 || size > (size_t)-1 - align_size) {
+		/* LCOV_EXCL_START */
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+
+	/* over-allocate so that an aligned address always fits in the block */
+	ptr = malloc(size + align_size);
+	if (!ptr) {
+		/* LCOV_EXCL_START */
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+
+	/* the caller has to release this pointer, not the aligned one */
+	*freeptr = ptr;
+
+	/* advance to the next multiple of @align_size, if not already on one */
+	offset = ((uintptr_t)ptr) % align_size;
+	if (offset != 0)
+		ptr += align_size - offset;
+
+	return ptr;
+}
+
+void *raid_malloc(size_t size, void **freeptr)
+{
+	/* same as raid_malloc_align(), using the default RAID_MALLOC_ALIGN */
+	void *aligned = raid_malloc_align(size, RAID_MALLOC_ALIGN, freeptr);
+
+	return aligned;
+}
+
+void **raid_malloc_vector_align(int nd, int n, size_t size, size_t align_size, size_t displacement_size, void **freeptr)
+{
+	void **v;
+	unsigned char *va;
+	size_t stride;
+	size_t total;
+	int i;
+
+	/*
+	 * Allocates one aligned region and returns a malloc()ed vector of
+	 * @n pointers into it, spaced @size + @displacement_size bytes
+	 * apart, with the first @nd entries in reversed order.
+	 * Returns NULL on failure. The caller has to free() both the
+	 * returned vector and *@freeptr, which is NULL on failure.
+	 */
+
+	/* @nd entries get reversed in place, so they must exist in the vector */
+	BUG_ON(n <= 0 || nd < 0 || nd > n);
+
+	*freeptr = NULL;
+
+	/* refuse sizes whose products would wrap around and under-allocate */
+	if (displacement_size > (size_t)-1 - size
+		|| (size_t)n > (size_t)-1 / sizeof(void *)) {
+		/* LCOV_EXCL_START */
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+	stride = size + displacement_size;
+	if (stride != 0 && (size_t)n > (size_t)-1 / stride) {
+		/* LCOV_EXCL_START */
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+	total = (size_t)n * stride;
+	if (total > (size_t)-1 - align_size) {
+		/* LCOV_EXCL_START */
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+
+	v = malloc((size_t)n * sizeof(void *));
+	if (!v) {
+		/* LCOV_EXCL_START */
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+
+	va = raid_malloc_align(total, align_size, freeptr);
+	if (!va) {
+		/* LCOV_EXCL_START */
+		*freeptr = NULL;
+		free(v);
+		return NULL;
+		/* LCOV_EXCL_STOP */
+	}
+
+	/* carve the region into @n consecutive blocks */
+	for (i = 0; i < n; ++i) {
+		v[i] = va;
+		va += stride;
+	}
+
+	/* reverse order of the data blocks */
+	/* because they are usually accessed from the last one */
+	for (i = 0; i < nd / 2; ++i) {
+		void *ptr = v[i];
+
+		v[i] = v[nd - 1 - i];
+		v[nd - 1 - i] = ptr;
+	}
+
+	return v;
+}
+
+void **raid_malloc_vector(int nd, int n, size_t size, void **freeptr)
+{
+	/* default alignment and displacement, see memory.h */
+	void **vec = raid_malloc_vector_align(nd, n, size,
+		RAID_MALLOC_ALIGN, RAID_MALLOC_DISPLACEMENT, freeptr);
+
+	return vec;
+}
+
+void raid_mrand_vector(unsigned seed, int n, size_t size, void **vv)
+{
+	unsigned char **blk = (unsigned char **)vv;
+	unsigned state = seed;
+	size_t pos;
+	int b;
+
+	/* a single generator state runs across all the blocks, in order */
+	for (b = 0; b < n; ++b) {
+		for (pos = 0; pos < size; ++pos) {
+			/* basic C99/C11 linear congruential generator */
+			state = state * 1103515245U + 12345U;
+
+			/* the store keeps only the low byte of the shifted state */
+			blk[b][pos] = state >> 16;
+		}
+	}
+}
+
+int raid_mtest_vector(int n, size_t size, void **vv)
+{
+	unsigned char **blk = (unsigned char **)vv;
+	unsigned char want;
+	unsigned char fill = 0;
+	unsigned pattern;
+	size_t pos;
+	int b;
+
+	/* start from a known state: every byte cleared */
+	for (b = 0; b < n; ++b)
+		for (pos = 0; pos < size; ++pos)
+			blk[b][pos] = fill;
+
+	/* go through every non-zero byte value */
+	for (pattern = 1; pattern < 256; ++pattern) {
+		/* ascending pass: verify the previous fill, store the pattern */
+		want = fill;
+		fill = pattern;
+		for (b = 0; b < n; ++b) {
+			unsigned char *mem = blk[b];
+
+			for (pos = 0; pos < size; ++pos) {
+				if (mem[pos] != want) {
+					/* LCOV_EXCL_START */
+					return -1;
+					/* LCOV_EXCL_STOP */
+				}
+				mem[pos] = fill;
+			}
+		}
+
+		/* descending pass: verify the pattern, store its complement */
+		want = fill;
+		fill = ~want;
+		for (b = 0; b < n; ++b) {
+			unsigned char *mem = blk[b];
+
+			pos = size;
+			while (pos > 0) {
+				--pos;
+				if (mem[pos] != want) {
+					/* LCOV_EXCL_START */
+					return -1;
+					/* LCOV_EXCL_STOP */
+				}
+				mem[pos] = fill;
+			}
+		}
+	}
+
+	return 0;
+}
+
--- a/raid/memory.h
+++ b/raid/memory.h
@ -0,0 +1,96 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_MEMORY_H
+#define __RAID_MEMORY_H
+
+/**
+ * Memory alignment provided by raid_malloc().
+ *
+ * It should guarantee good cache performance everywhere.
+ */
+#define RAID_MALLOC_ALIGN 256
+
+/**
+ * Memory displacement to avoid cache address sharing on contiguous blocks,
+ * used by raid_malloc_vector().
+ *
+ * When allocating a sequence of blocks with a size of power of 2,
+ * there is the risk that the addresses of each block are mapped into the
+ * same cache line and prefetching predictor, resulting in a lot of cache
+ * sharing if you access all the blocks in parallel, from the start to the
+ * end.
+ *
+ * To avoid this effect, it's better if all the blocks are allocated
+ * with a fixed displacement trying to reduce the cache addresses sharing.
+ *
+ * The selected displacement was chosen empirically with some speed tests
+ * with 8/12/16/20/24 data buffers of 256 KB.
+ *
+ * These are the results in MB/s with no displacement:
+ *
+ *            sse2
+ *    gen1   15368 [MB/s]
+ *    gen2    6814 [MB/s]
+ *    genz    3033 [MB/s]
+ *
+ * These are the results with displacement, showing improvements
+ * on the order of 20% or more:
+ *
+ *            sse2
+ *    gen1   21936 [MB/s]
+ *    gen2   11902 [MB/s]
+ *    genz    5838 [MB/s]
+ *
+ */
+#define RAID_MALLOC_DISPLACEMENT (7*256)
+
+/**
+ * Aligned malloc.
+ * Use an alignment suitable for the raid functions.
+ */
+void *raid_malloc(size_t size, void **freeptr);
+
+/**
+ * Arbitrary aligned malloc.
+ */
+void *raid_malloc_align(size_t size, size_t align_size, void **freeptr);
+
+/**
+ * Aligned vector allocation.
+ * Use an alignment suitable for the raid functions.
+ * Returns a vector of @n pointers, each one pointing to a block of
+ * the specified @size.
+ * The first @nd elements are reversed in order.
+ */
+void **raid_malloc_vector(int nd, int n, size_t size, void **freeptr);
+
+/**
+ * Arbitrary aligned vector allocation.
+ */
+void **raid_malloc_vector_align(int nd, int n, size_t size, size_t align_size, size_t displacement_size, void **freeptr);
+
+/**
+ * Fills the memory vector with pseudo-random data based on the specified seed.
+ */
+void raid_mrand_vector(unsigned seed, int n, size_t size, void **vv);
+
+/**
+ * Tests the memory vector for RAM problems, overwriting its content.
+ * Returns 0 if no problem is found, or -1 at the first mismatch.
+ */
+int raid_mtest_vector(int n, size_t size, void **vv);
+
+#endif
+
--- a/raid/module.c
+++ b/raid/module.c
@ -0,0 +1,473 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "memory.h"
+#include "cpu.h"
+
+/*
+ * Initializes and selects the best algorithm.
+ *
+ * The forwarders are first pointed at the portable C implementations.
+ * Then, on CONFIG_X86 builds, each block below replaces them when the
+ * matching raid_cpu_has_*() check succeeds. The blocks run in the order
+ * SSE2, SSSE3, AVX2, so a later block overrides what an earlier one set.
+ *
+ * raid_gen_ptr[2] is not assigned here; only raid_gen3_ptr and
+ * raid_genz_ptr are. NOTE(review): presumably raid_mode() picks one of
+ * the two for that slot - confirm against its definition.
+ */
+void raid_init(void)
+{
+	raid_gen3_ptr = raid_gen3_int8; /* portable defaults, possibly replaced below */
+	raid_gen_ptr[3] = raid_gen4_int8;
+	raid_gen_ptr[4] = raid_gen5_int8;
+	raid_gen_ptr[5] = raid_gen6_int8;
+
+	if (sizeof(void *) == 4) { /* 4-byte pointers: use the 32-bit word variants */
+		raid_gen_ptr[0] = raid_gen1_int32;
+		raid_gen_ptr[1] = raid_gen2_int32;
+		raid_genz_ptr = raid_genz_int32;
+	} else { /* otherwise use the 64-bit word variants */
+		raid_gen_ptr[0] = raid_gen1_int64;
+		raid_gen_ptr[1] = raid_gen2_int64;
+		raid_genz_ptr = raid_genz_int64;
+	}
+
+	raid_rec_ptr[0] = raid_rec1_int8; /* portable recovering defaults */
+	raid_rec_ptr[1] = raid_rec2_int8;
+	raid_rec_ptr[2] = raid_recX_int8; /* slots 2..5 all share the generic recX routine */
+	raid_rec_ptr[3] = raid_recX_int8;
+	raid_rec_ptr[4] = raid_recX_int8;
+	raid_rec_ptr[5] = raid_recX_int8;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+	if (raid_cpu_has_sse2()) { /* SSE2: replaces gen1, gen2 and genz */
+		raid_gen_ptr[0] = raid_gen1_sse2;
+#ifdef CONFIG_X86_64
+		if (raid_cpu_has_slowextendedreg()) {
+			raid_gen_ptr[1] = raid_gen2_sse2;
+		} else {
+			raid_gen_ptr[1] = raid_gen2_sse2ext;
+		}
+		/* note that raid_cpu_has_slowextendedreg() doesn't affect parz */
+		raid_genz_ptr = raid_genz_sse2ext;
+#else
+		raid_gen_ptr[1] = raid_gen2_sse2;
+		raid_genz_ptr = raid_genz_sse2;
+#endif
+	}
+#endif
+
+#ifdef CONFIG_SSSE3
+	if (raid_cpu_has_ssse3()) { /* SSSE3: replaces gen3..gen6 and all the rec slots */
+#ifdef CONFIG_X86_64
+		if (raid_cpu_has_slowextendedreg()) {
+			raid_gen3_ptr = raid_gen3_ssse3;
+			raid_gen_ptr[3] = raid_gen4_ssse3;
+			raid_gen_ptr[4] = raid_gen5_ssse3;
+			raid_gen_ptr[5] = raid_gen6_ssse3;
+		} else {
+			raid_gen3_ptr = raid_gen3_ssse3ext;
+			raid_gen_ptr[3] = raid_gen4_ssse3ext;
+			raid_gen_ptr[4] = raid_gen5_ssse3ext;
+			raid_gen_ptr[5] = raid_gen6_ssse3ext;
+		}
+#else
+		raid_gen3_ptr = raid_gen3_ssse3;
+		raid_gen_ptr[3] = raid_gen4_ssse3;
+		raid_gen_ptr[4] = raid_gen5_ssse3;
+		raid_gen_ptr[5] = raid_gen6_ssse3;
+#endif
+		raid_rec_ptr[0] = raid_rec1_ssse3;
+		raid_rec_ptr[1] = raid_rec2_ssse3;
+		raid_rec_ptr[2] = raid_recX_ssse3;
+		raid_rec_ptr[3] = raid_recX_ssse3;
+		raid_rec_ptr[4] = raid_recX_ssse3;
+		raid_rec_ptr[5] = raid_recX_ssse3;
+	}
+#endif
+
+#ifdef CONFIG_AVX2
+	if (raid_cpu_has_avx2()) { /* AVX2: checked last, so it wins over SSE2/SSSE3 */
+		raid_gen_ptr[0] = raid_gen1_avx2;
+		raid_gen_ptr[1] = raid_gen2_avx2;
+#ifdef CONFIG_X86_64 /* the avx2ext generators are installed only on 64-bit builds */
+		raid_gen3_ptr = raid_gen3_avx2ext;
+		raid_genz_ptr = raid_genz_avx2ext;
+		raid_gen_ptr[3] = raid_gen4_avx2ext;
+		raid_gen_ptr[4] = raid_gen5_avx2ext;
+		raid_gen_ptr[5] = raid_gen6_avx2ext;
+#endif
+		raid_rec_ptr[0] = raid_rec1_avx2;
+		raid_rec_ptr[1] = raid_rec2_avx2;
+		raid_rec_ptr[2] = raid_recX_avx2;
+		raid_rec_ptr[3] = raid_recX_avx2;
+		raid_rec_ptr[4] = raid_recX_avx2;
+		raid_rec_ptr[5] = raid_recX_avx2;
+	}
+#endif
+#endif /* CONFIG_X86 */
+
+	/* set the default mode */
+	raid_mode(RAID_MODE_CAUCHY);
+}
+
+/*
+ * Reference parity computation.
+ */
+void raid_gen_ref(int nd, int np, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	size_t pos;
+
+	/* one byte position at a time, across all the blocks */
+	for (pos = 0; pos < size; ++pos) {
+		uint8_t acc[RAID_PARITY_MAX];
+		int par, disk;
+
+		/* clear one accumulator for each requested parity */
+		par = 0;
+		while (par < np)
+			acc[par++] = 0;
+
+		for (disk = 0; disk < nd; ++disk) {
+			/* row of the multiplication table for this data byte */
+			const uint8_t *row = gfmul[v[disk][pos]];
+
+			/* add the data byte times the generator coefficient */
+			for (par = 0; par < np; ++par)
+				acc[par] ^= row[gfgen[par][disk]];
+		}
+
+		/* the parity blocks follow the @nd data blocks in the vector */
+		for (par = 0; par < np; ++par)
+			v[nd + par][pos] = acc[par];
+	}
+}
+
+/*
+ * Size of the blocks to test.
+ */
+#define TEST_SIZE 4096
+
+/*
+ * Number of data blocks to test.
+ */
+#define TEST_COUNT (65536 / TEST_SIZE)
+
+/*
+ * Parity generation test.
+ */
+static int raid_test_par(int nd, int np, size_t size, void **v, void **ref)
+{
+	void *t[TEST_COUNT + RAID_PARITY_MAX];
+	int k;
+
+	/* the data blocks are read from the reference */
+	for (k = 0; k < nd; ++k)
+		t[k] = ref[k];
+
+	/* the parity blocks are written into the buffers of @v */
+	for (k = nd; k < nd + np; ++k)
+		t[k] = v[k];
+
+	raid_gen(nd, np, size, t);
+
+	/* every generated parity has to match the reference one */
+	for (k = nd; k < nd + np; ++k) {
+		if (memcmp(t[k], ref[k], size) == 0)
+			continue;
+
+		/* LCOV_EXCL_START */
+		return -1;
+		/* LCOV_EXCL_STOP */
+	}
+
+	return 0;
+}
+
+/*
+ * Recovering test.
+ */
+static int raid_test_rec(int nr, int *ir, int nd, int np, size_t size, void **v, void **ref)
+{
+	void *t[TEST_COUNT + RAID_PARITY_MAX];
+	int next = 0;
+	int k;
+
+	/* build the vector: blocks listed in @ir point to the buffers */
+	/* of @v to be recovered, all the others point to the reference */
+	for (k = 0; k < nd + np; ++k) {
+		int failed = next < nr && ir[next] == k;
+
+		t[k] = failed ? v[k] : ref[k];
+		if (failed)
+			++next;
+	}
+
+	raid_rec(nr, ir, nd, np, size, t);
+
+	/* every block not aliasing the reference has to match it */
+	for (k = 0; k < nd + np; ++k) {
+		if (t[k] == ref[k])
+			continue;
+
+		if (memcmp(t[k], ref[k], size) != 0) {
+			/* LCOV_EXCL_START */
+			return -1;
+			/* LCOV_EXCL_STOP */
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Recovering test for data.
+ */
+static int raid_test_data(int nr, int *id, int *ip, int nd, int np, size_t size, void **v, void **ref)
+{
+	void *t[TEST_COUNT + RAID_PARITY_MAX];
+	int next;
+	int k;
+
+	/* data: blocks listed in @id point to the buffers of @v */
+	/* to be recovered, the others are taken from the reference */
+	next = 0;
+	for (k = 0; k < nd; ++k) {
+		int failed = next < nr && id[next] == k;
+
+		t[k] = failed ? v[k] : ref[k];
+		if (failed)
+			++next;
+	}
+
+	/* parity: only the blocks listed in @ip are provided, */
+	/* the others are null because they should not be read or written */
+	next = 0;
+	for (k = 0; k < np; ++k) {
+		int used = next < nr && ip[next] == k;
+
+		t[nd + k] = used ? ref[nd + k] : 0;
+		if (used)
+			++next;
+	}
+
+	raid_data(nr, id, ip, nd, size, t);
+
+	/* every non-null data block not aliasing the reference has to match it */
+	for (k = 0; k < nd; ++k) {
+		if (t[k] == ref[k] || t[k] == 0)
+			continue;
+
+		if (memcmp(t[k], ref[k], size) != 0) {
+			/* LCOV_EXCL_START */
+			return -1;
+			/* LCOV_EXCL_STOP */
+		}
+	}
+
+	return 0;
+}
+
+/*
+ * Scan test.
+ */
+static int raid_test_scan(int nr, int *ir, int nd, int np, size_t size, void **v, void **ref)
+{
+	void *t[TEST_COUNT + RAID_PARITY_MAX];
+	int is[RAID_PARITY_MAX];
+	int next = 0;
+	int found;
+	int k;
+
+	/* build the vector: blocks listed in @ir are the bad ones taken */
+	/* from @v, all the others point to the reference */
+	for (k = 0; k < nd + np; ++k) {
+		int bad = next < nr && ir[next] == k;
+
+		t[k] = bad ? v[k] : ref[k];
+		if (bad)
+			++next;
+	}
+
+	found = raid_scan(is, nd, np, size, t);
+
+	/* the scan has to report exactly @nr bad blocks... */
+	if (found != nr)
+		return -1;
+
+	/* ...and the same indexes, in the same order */
+	for (k = 0; k < nr; ++k) {
+		if (is[k] == ir[k])
+			continue;
+
+		/* LCOV_EXCL_START */
+		return -1;
+		/* LCOV_EXCL_STOP */
+	}
+
+	return 0;
+}
+
+/*
+ * Basic functionality self test.
+ *
+ * Uses TEST_COUNT data blocks of TEST_SIZE bytes taken from the gfmul
+ * table, computes the reference parity with raid_gen_ref() and then,
+ * for each parity level from 1 to RAID_PARITY_MAX, checks generation,
+ * recovering and scanning against that reference.
+ *
+ * Returns 0 if all the tests pass, -1 otherwise.
+ */
+int raid_selftest(void)
+{
+	const int nd = TEST_COUNT;
+	const size_t size = TEST_SIZE;
+	const int nv = nd + RAID_PARITY_MAX * 2 + 1;
+	void *v_alloc;
+	void **v;
+	/* sized with the constants, like the vectors of the test helpers */
+	void *ref[TEST_COUNT + RAID_PARITY_MAX];
+	int ir[RAID_PARITY_MAX];
+	int ip[RAID_PARITY_MAX];
+	int i, np;
+	int ret = 0;
+
+	/* ensure to have enough space for data */
+	BUG_ON(nd * size > 65536);
+
+	/* layout of v: nd data buffers, RAID_PARITY_MAX parity buffers, */
+	/* RAID_PARITY_MAX reference parity buffers and one zero buffer */
+	v = raid_malloc_vector(nd, nv, size, &v_alloc);
+	if (!v) {
+		/* LCOV_EXCL_START */
+		return -1;
+		/* LCOV_EXCL_STOP */
+	}
+
+	/* the last buffer is cleared and handed to raid_zero() */
+	memset(v[nv - 1], 0, size);
+	raid_zero(v[nv - 1]);
+
+	/* use the multiplication table as data */
+	for (i = 0; i < nd; ++i)
+		ref[i] = ((uint8_t *)gfmul) + size * i;
+
+	/* setup reference parity */
+	for (i = 0; i < RAID_PARITY_MAX; ++i)
+		ref[nd + i] = v[nd + RAID_PARITY_MAX + i];
+
+	/* compute reference parity */
+	raid_gen_ref(nd, RAID_PARITY_MAX, size, ref);
+
+	/* test for each parity level */
+	for (np = 1; np <= RAID_PARITY_MAX; ++np) {
+		/* test parity generation */
+		ret = raid_test_par(nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		/* test recovering with broken ending data disks */
+		for (i = 0; i < np; ++i) {
+			/* bad data */
+			ir[i] = nd - np + i;
+
+			/* good parity */
+			ip[i] = i;
+		}
+
+		ret = raid_test_rec(np, ir, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		ret = raid_test_data(np, ir, ip, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		/* test recovering with broken leading data and broken leading parity */
+		for (i = 0; i < np / 2; ++i) {
+			/* bad data */
+			ir[i] = i;
+
+			/* good parity */
+			ip[i] = (np + 1) / 2 + i;
+		}
+
+		/* bad parity */
+		for (i = 0; i < (np + 1) / 2; ++i)
+			ir[np / 2 + i] = nd + i;
+
+		ret = raid_test_rec(np, ir, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		/* only the first np / 2 entries of ir are data blocks */
+		ret = raid_test_data(np / 2, ir, ip, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		/* test recovering with broken leading data and broken ending parity */
+		for (i = 0; i < np / 2; ++i) {
+			/* bad data */
+			ir[i] = i;
+
+			/* good parity */
+			ip[i] = i;
+		}
+
+		/* bad parity */
+		for (i = 0; i < (np + 1) / 2; ++i)
+			ir[np / 2 + i] = nd + np - (np + 1) / 2 + i;
+
+		ret = raid_test_rec(np, ir, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		ret = raid_test_data(np / 2, ir, ip, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+
+		/* scan test with broken data and parity */
+		for (i = 0; i < np / 2; ++i) {
+			/* bad data */
+			ir[i] = i;
+		}
+		for (i = 0; i < (np - 1) / 2; ++i) {
+			/* bad parity */
+			ir[np / 2 + i] = nd + i;
+		}
+		for (i = 0; i < np - 1; ++i) {
+			/* make blocks bad */
+			/* we cannot fill them with 0, because the original */
+			/* data may be already filled with 0 */
+			memset(v[ir[i]], 0x55, size);
+		}
+
+		ret = raid_test_scan(np - 1, ir, nd, np, size, v, ref);
+		if (ret != 0) {
+			/* LCOV_EXCL_START */
+			goto bail;
+			/* LCOV_EXCL_STOP */
+		}
+	}
+
+	/* scan test with no parity: here the scan is expected to fail */
+	ret = raid_test_scan(0, 0, nd, 0, size, v, ref);
+	if (ret != -1) {
+		/* LCOV_EXCL_START */
+		/* an unexpected success would leave ret at 0, so force */
+		/* the error code before bailing out */
+		ret = -1;
+		goto bail;
+		/* LCOV_EXCL_STOP */
+	}
+
+	ret = 0;
+
+bail:
+	free(v);
+	free(v_alloc);
+
+	return ret;
+}
+
--- a/raid/raid.c
+++ b/raid/raid.c
@ -0,0 +1,586 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "gf.h"
+
+/*
+ * This is a RAID implementation working in the Galois Field GF(2^8) with
+ * the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), and
+ * supporting up to six parity levels.
+ *
+ * For RAID5 and RAID6 it works as described in H. Peter Anvin's
+ * paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
+ * complete explanation.
+ *
+ * To support triple parity, an extension of the same approach was first
+ * evaluated and then dropped: additional parity coefficients are set as
+ * powers of 2^-1, with equations:
+ *
+ * P = sum(Di)
+ * Q = sum(2^i * Di)
+ * R = sum(2^-i * Di) with 0<=i<N
+ *
+ * This approach works well for triple parity and it's very efficient,
+ * because we can implement very fast parallel multiplications and
+ * divisions by 2 in GF(2^8).
+ *
+ * It's also similar to the approach used by ZFS RAIDZ3, with the
+ * difference that ZFS uses powers of 4 instead of 2^-1.
+ *
+ * Unfortunately it doesn't work beyond triple parity, because whatever
+ * value we choose to generate the power coefficients to compute other
+ * parities, the resulting equations are not solvable for some
+ * combinations of missing disks.
+ *
+ * This is expected, because the Vandermonde matrix used to compute the
+ * parity has no guarantee to have all submatrices not singular
+ * [2, Chap 11, Problem 7] and this is a requirement to have
+ * a MDS (Maximum Distance Separable) code [2, Chap 11, Theorem 8].
+ *
+ * To overcome this limitation, we use a Cauchy matrix [3][4] to compute
+ * the parity. A Cauchy matrix has the property to have all the square
+ * submatrices not singular, resulting in always solvable equations,
+ * for any combination of missing disks.
+ *
+ * The problem of this approach is that it requires the use of
+ * generic multiplications, and not only by 2 or 2^-1, potentially
+ * affecting badly the performance.
+ *
+ * Fortunately there is a method to implement parallel multiplications
+ * using SSSE3 or AVX2 instructions [1][5]. This method is competitive with
+ * the computation of triple parity using power coefficients.
+ *
+ * Another important property of the Cauchy matrix is that we can set up
+ * the first two rows with coefficients equal to those of the RAID5 and
+ * RAID6 approach described, resulting in a compatible extension that
+ * requires SSSE3 or AVX2 only if triple parity or beyond is used.
+ *
+ * The matrix is also adjusted, multiplying each row by a constant factor
+ * to make the first column all 1s, to optimize the computation for
+ * the first disk.
+ *
+ * This results in the matrix A[row,col] defined as:
+ *
+ * 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
+ * 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
+ * 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
+ * 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
+ * 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
+ * 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
+ *
+ * This matrix supports 6 levels of parity, one for each row, for up to 251
+ * data disks, one for each column, with all the 377,342,351,231 square
+ * submatrices not singular, verified also with brute-force.
+ *
+ * This matrix can be extended to support any number of parities, just
+ * adding additional rows, and removing one column for each new row.
+ * (see mktables.c for more details in how the matrix is generated)
+ *
+ * In details, parity is computed as:
+ *
+ * P = sum(Di)
+ * Q = sum(2^i *  Di)
+ * R = sum(A[2,i] * Di)
+ * S = sum(A[3,i] * Di)
+ * T = sum(A[4,i] * Di)
+ * U = sum(A[5,i] * Di) with 0<=i<N
+ *
+ * To recover from a failure of six disks at indexes x,y,z,h,v,w,
+ * with 0<=x<y<z<h<v<w<N, we compute the parity of the available N-6
+ * disks as:
+ *
+ * Pa = sum(Di)
+ * Qa = sum(2^i * Di)
+ * Ra = sum(A[2,i] * Di)
+ * Sa = sum(A[3,i] * Di)
+ * Ta = sum(A[4,i] * Di)
+ * Ua = sum(A[5,i] * Di) with 0<=i<N,i!=x,i!=y,i!=z,i!=h,i!=v,i!=w.
+ *
+ * And if we define:
+ *
+ * Pd = Pa + P
+ * Qd = Qa + Q
+ * Rd = Ra + R
+ * Sd = Sa + S
+ * Td = Ta + T
+ * Ud = Ua + U
+ *
+ * we can sum these two sets of equations, obtaining:
+ *
+ * Pd =          Dx +          Dy +          Dz +          Dh +          Dv +          Dw
+ * Qd =    2^x * Dx +    2^y * Dy +    2^z * Dz +    2^h * Dh +    2^v * Dv +    2^w * Dw
+ * Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
+ * Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
+ * Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
+ * Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
+ *
+ * This linear system is always solvable because the coefficient matrix is
+ * never singular, due to the properties of the matrix A[].
+ *
+ * Resulting speed in x64, with 8 data disks, using a stripe of 256 KiB,
+ * for a Core i5-4670K Haswell Quad-Core 3.4GHz is:
+ *
+ *             int8   int32   int64    sse2   ssse3    avx2
+ *   gen1             13339   25438   45438           50588
+ *   gen2              4115    6514   21840           32201
+ *   gen3       814                           10154   18613
+ *   gen4       620                            7569   14229
+ *   gen5       496                            5149   10051
+ *   gen6       413                            4239    8190
+ *
+ * Values are in MiB/s of data processed by a single thread, not counting
+ * generated parity.
+ *
+ * You can replicate these results in your machine using the
+ * "raid/test/speedtest.c" program.
+ *
+ * For comparison, the triple parity computation using the power
+ * coefficients "1,2,2^-1" is only a little faster than the one based on
+ * the Cauchy matrix if SSSE3 or AVX2 is present.
+ *
+ *             int8   int32   int64    sse2   ssse3    avx2
+ *   genz              2337    2874   10920           18944
+ *
+ * In conclusion, the use of power coefficients, and specifically powers
+ * of 1,2,2^-1, is the best option to implement triple parity in CPUs
+ * without SSSE3 and AVX2.
+ * But if a modern CPU with SSSE3 or AVX2 is available, the Cauchy
+ * matrix is the best option because it provides a fast and general
+ * approach working for any number of parities.
+ *
+ * References:
+ * [1] Anvin, "The mathematics of RAID-6", 2004
+ * [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
+ * [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
+ * [4] Roth, "Introduction to Coding Theory", 2006
+ * [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
+ */
+
+/**
+ * Generator matrix currently in use; raid_mode() switches it.
+ */
+const uint8_t (*raid_gfgen)[256];
+
+/**
+ * Selects the generator matrix and the triple parity routine.
+ *
+ * RAID_MODE_VANDERMONDE picks the Vandermonde matrix together with the
+ * "genz" routine. Any other value picks the Cauchy matrix together with
+ * the "gen3" routine.
+ */
+void raid_mode(int mode)
+{
+	switch (mode) {
+	case RAID_MODE_VANDERMONDE:
+		raid_gen_ptr[2] = raid_genz_ptr;
+		raid_gfgen = gfvandermonde;
+		break;
+	default:
+		raid_gen_ptr[2] = raid_gen3_ptr;
+		raid_gfgen = gfcauchy;
+		break;
+	}
+}
+
+/**
+ * Buffer filled with 0 used in recovering.
+ *
+ * It is installed by raid_zero() and raid_delta_gen() points the missing
+ * data blocks at it while the partial parity is computed.
+ */
+static void *raid_zero_block;
+
+/**
+ * Records the zero buffer provided by the caller.
+ *
+ * Only the pointer is stored; nothing is copied or verified here.
+ *
+ * @zero Buffer to use in place of the missing data blocks.
+ */
+void raid_zero(void *zero)
+{
+	raid_zero_block = zero;
+}
+
+/*
+ * Forwarders for parity computation.
+ *
+ * These functions compute the parity blocks from the provided data.
+ *
+ * The number of parities to compute is implicit in the position in the
+ * forwarder vector. Position at index #i, computes (#i+1) parities.
+ *
+ * All these functions give the guarantee that parities are written
+ * in order. First parity P, then parity Q, and so on.
+ * This allows specifying the same memory buffer for multiple parities
+ * knowing that you'll get the latest written one.
+ * This characteristic is used by the raid_delta_gen() function to
+ * avoid damaging unused parities in recovering.
+ *
+ * @nd Number of data blocks
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + #parities) elements. The starting elements are the blocks
+ *   for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ */
+void (*raid_gen_ptr[RAID_PARITY_MAX])(int nd, size_t size, void **vv);
+void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
+void (*raid_genz_ptr)(int nd, size_t size, void **vv);
+
+/**
+ * Computes @np parities by dispatching to the matching forwarder.
+ *
+ * The arguments are validated with BUG_ON() before the forwarder at
+ * position (@np - 1) of raid_gen_ptr[] is invoked.
+ */
+void raid_gen(int nd, int np, size_t size, void **v)
+{
+	void (*gen)(int nd, size_t size, void **vv);
+
+	/* the block size must be a multiple of 64 */
+	BUG_ON(size % 64 != 0);
+
+	/* the number of parities must be in the 1..RAID_PARITY_MAX range */
+	BUG_ON(np < 1);
+	BUG_ON(np > RAID_PARITY_MAX);
+
+	/* the forwarder at index #i computes (#i+1) parities */
+	gen = raid_gen_ptr[np - 1];
+	gen(nd, size, v);
+}
+
+/**
+ * Inverts the square matrix M of size nxn into V.
+ *
+ * This is not a general matrix inversion: no pivoting is done, because
+ * the matrix M is assumed to have all the square submatrices not
+ * singular, and then the diagonal element is never 0 when it's reached.
+ *
+ * Gauss elimination is used, clearing the elements both over and under
+ * the diagonal. The matrix M is modified in place while V is built.
+ *
+ * @M Matrix to invert with @n rows and @n columns. It's overwritten.
+ * @V Destination matrix where the result is put.
+ * @n Number of rows and columns of the matrix.
+ */
+void raid_invert(uint8_t *M, uint8_t *V, int n)
+{
+	int row, col, d;
+
+	/* start with the identity matrix in V */
+	for (row = 0; row < n; ++row) {
+		uint8_t *v_row = V + row * n;
+
+		for (col = 0; col < n; ++col)
+			v_row[col] = row == col;
+	}
+
+	/* walk the diagonal */
+	for (d = 0; d < n; ++d) {
+		uint8_t *m_piv = M + d * n;
+		uint8_t *v_piv = V + d * n;
+		uint8_t scale;
+
+		/* a zero here would mean a singular submatrix, */
+		/* that is excluded by the assumption on M */
+		BUG_ON(m_piv[d] == 0);
+
+		/* scale the pivot row to get 1 in the diagonal */
+		scale = inv(m_piv[d]);
+		for (col = 0; col < n; ++col) {
+			m_piv[col] = mul(scale, m_piv[col]);
+			v_piv[col] = mul(scale, v_piv[col]);
+		}
+
+		/* clear the pivot column in all the other rows */
+		for (row = 0; row < n; ++row) {
+			uint8_t *m_row;
+			uint8_t *v_row;
+			uint8_t factor;
+
+			if (row == d)
+				continue;
+
+			m_row = M + row * n;
+			v_row = V + row * n;
+
+			/* read the factor before the row is changed */
+			factor = m_row[d];
+			for (col = 0; col < n; ++col) {
+				m_row[col] ^= mul(factor, m_piv[col]);
+				v_row[col] ^= mul(factor, v_piv[col]);
+			}
+		}
+	}
+}
+
+/**
+ * Computes the parity without the missing data blocks
+ * and stores it in the buffers of such data blocks.
+ *
+ * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
+ *
+ * The vector @v is temporarily rearranged for the raid_gen() call and
+ * restored to its original content before returning.
+ *
+ * @nr Number of missing data blocks. The code reads ip[nr - 1] and
+ *   id[nr - 1], so it must be at least 1.
+ * @id[] Vector of @nr indexes of the missing data blocks.
+ * @ip[] Vector of @nr indexes of the parities to use, starting from 0.
+ *   The last one defines how many parities are computed.
+ * @nd Number of data blocks.
+ * @size Size of the blocks, passed unchanged to raid_gen().
+ * @v Vector of pointers to the blocks of data and parity.
+ */
+void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
+{
+	void *p[RAID_PARITY_MAX];
+	void *pa[RAID_PARITY_MAX];
+	int i, j;
+	int np;
+	void *latest;
+
+	/* total number of parities we are going to process */
+	/* they are both the used and the unused ones */
+	np = ip[nr - 1] + 1;
+
+	/* latest missing data block */
+	latest = v[id[nr - 1]];
+
+	/* setup pointers for delta computation */
+	for (i = 0, j = 0; i < np; ++i) {
+		/* keep a copy of the original parity vector */
+		p[i] = v[nd + i];
+
+		if (ip[j] == i) {
+			/*
+			 * Set used parities to point to the missing
+			 * data blocks.
+			 *
+			 * The related data blocks are instead set
+			 * to point to the "zero" buffer.
+			 */
+
+			/* the latest parity to use ends the for loop and */
+			/* then it cannot happen to process more of them */
+			BUG_ON(j >= nr);
+
+			/* buffer for missing data blocks */
+			pa[j] = v[id[j]];
+
+			/* set at zero the missing data blocks */
+			v[id[j]] = raid_zero_block;
+
+			/* compute the parity over the missing data blocks */
+			v[nd + i] = pa[j];
+
+			/* check for the next used entry */
+			++j;
+		} else {
+			/*
+			 * Unused parities are going to be rewritten with
+			 * meaningless data, because we don't have
+			 * functions able to compute only a subset of
+			 * parities.
+			 *
+			 * To avoid this, we reuse parity buffers,
+			 * assuming that all the parity functions write
+			 * parities in order.
+			 *
+			 * We assign the unused parity block to the same
+			 * block of the latest used parity that we know
+			 * will be written.
+			 *
+			 * This means that this block will be written
+			 * multiple times and only the latest write will
+			 * contain the correct data.
+			 */
+			v[nd + i] = latest;
+		}
+	}
+
+	/* all the parities have to be processed */
+	BUG_ON(j != nr);
+
+	/* recompute the parity, note that np may be smaller than the */
+	/* total number of parities available */
+	raid_gen(nd, np, size, v);
+
+	/* restore data buffers as before */
+	for (j = 0; j < nr; ++j)
+		v[id[j]] = pa[j];
+
+	/* restore parity buffers as before */
+	for (i = 0; i < np; ++i)
+		v[nd + i] = p[i];
+}
+
+/**
+ * Recover failure of one data block for PAR1.
+ *
+ * Starting from the equation:
+ *
+ * Pd = Dx
+ *
+ * and solving we get:
+ *
+ * Dx = Pd
+ *
+ * In practice the missing block is just the first parity computed over
+ * all the other data blocks plus the parity block itself. This is
+ * obtained by swapping the two pointers in @v and calling raid_gen().
+ *
+ * @id[] Vector with the index of the data block to recover in id[0].
+ * @nd Number of data blocks.
+ * @size Size of the blocks, passed unchanged to raid_gen().
+ * @v Vector of pointers to the blocks of data and parity. It's
+ *   restored to the original content before returning.
+ */
+void raid_rec1of1(int *id, int nd, size_t size, void **v)
+{
+	/* for PAR1 the zero buffer is not needed */
+	void *parity = v[nd];
+	void *missing = v[id[0]];
+
+	/* exchange roles: the parity is read as data, */
+	/* and the missing block is written as parity */
+	v[id[0]] = parity;
+	v[nd] = missing;
+
+	raid_gen(nd, 1, size, v);
+
+	/* put the pointers back in place */
+	v[id[0]] = missing;
+	v[nd] = parity;
+}
+
+/**
+ * Recover failure of two data blocks for PAR2.
+ *
+ * Starting from the equations:
+ *
+ * Pd = Dx + Dy
+ * Qd = 2^id[0] * Dx + 2^id[1] * Dy
+ *
+ * and solving we get:
+ *
+ *               1                     2^(-id[0])
+ * Dy = ------------------- * Pd + ------------------- * Qd
+ *      2^(id[1]-id[0]) + 1        2^(id[1]-id[0]) + 1
+ *
+ * Dx = Dy + Pd
+ *
+ * with conditions:
+ *
+ * 2^id[0] != 0
+ * 2^(id[1]-id[0]) + 1 != 0
+ *
+ * These are always satisfied for any 0<=id[0]<id[1]<255.
+ *
+ * The parities are read from the first two parity positions of @vv,
+ * and the recovered blocks are written in place of the missing ones.
+ */
+void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t **)vv;
+	const uint8_t *mul_pd; /* multiplication table applied to Pd */
+	const uint8_t *mul_qd; /* multiplication table applied to Qd */
+	uint8_t *p;
+	uint8_t *q;
+	uint8_t *dx;
+	uint8_t *dy;
+	size_t off;
+
+	/* the two coefficients of the Dy equation, as lookup tables */
+	mul_pd = table(inv(pow2(id[1] - id[0]) ^ 1));
+	mul_qd = table(inv(pow2(id[0]) ^ pow2(id[1])));
+
+	/* the missing blocks now receive the partial parities Pa and Qa */
+	raid_delta_gen(2, id, ip, nd, size, vv);
+
+	p = v[nd];
+	q = v[nd + 1];
+	dx = v[id[0]];
+	dy = v[id[1]];
+
+	for (off = 0; off < size; ++off) {
+		/* deltas between stored and partial parities */
+		uint8_t pd = p[off] ^ dx[off];
+		uint8_t qd = q[off] ^ dy[off];
+
+		/* solve for Dy, then Dx follows from Pd */
+		uint8_t y = mul_pd[pd] ^ mul_qd[qd];
+
+		dx[off] = pd ^ y;
+		dy[off] = y;
+	}
+}
+
+/*
+ * Forwarders for data recovery.
+ *
+ * These functions recover data blocks using the specified parity
+ * to recompute the missing data.
+ *
+ * Note that the format of vectors @id/@ip is different than raid_rec().
+ * For example, in the vector @ip the first parity is represented with the
+ * value 0 and not @nd.
+ *
+ * @nr Number of failed data blocks to recover.
+ * @id[] Vector of @nr indexes of the data blocks to recover.
+ *   The indexes start from 0. They must be in order.
+ * @ip[] Vector of @nr indexes of the parity blocks to use in the recovering.
+ *   The indexes start from 0. They must be in order.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @np) elements. The starting elements are the blocks
+ *   for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ */
+void (*raid_rec_ptr[RAID_PARITY_MAX])(
+	int nr, int *id, int *ip, int nd, size_t size, void **vv);
+
+/**
+ * Recovers the @nr failed data and parity blocks listed in @ir.
+ *
+ * Failed data blocks are recovered first, using the parities at the
+ * lowest indexes that are not listed as failed. Then all the parities up
+ * to the last failed one are recomputed.
+ *
+ * Any violation of the argument constraints triggers BUG_ON().
+ *
+ * @nr Number of failed blocks. It must be in the 0..@np range.
+ * @ir[] Vector of @nr indexes of the failed blocks, in strictly
+ *   increasing order. Parities are represented starting from @nd.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks. At most RAID_PARITY_MAX.
+ * @size Size of the blocks. It must be a multiple of 64.
+ * @v Vector of (@nd + @np) pointers to the blocks of data and parity.
+ */
+void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
+{
+	int nrd; /* number of data blocks to recover */
+	int nrp; /* number of parity blocks to recover */
+	int i;
+
+	/* enforce limit on size */
+	BUG_ON(size % 64 != 0);
+
+	/* enforce limit on number of failures */
+	/* a negative count would pass all the other checks */
+	/* and end up reading ir[nr - 1] out of bounds */
+	BUG_ON(nr < 0);
+	BUG_ON(nr > np);
+	BUG_ON(np > RAID_PARITY_MAX);
+
+	/* enforce order in index vector, for any number of entries */
+	for (i = 1; i < nr; ++i) {
+		BUG_ON(ir[i - 1] >= ir[i]);
+	}
+
+	/* enforce limit on index vector; being it ordered, it's */
+	/* enough to check the first and the last entries */
+	BUG_ON(nr > 0 && ir[0] < 0);
+	BUG_ON(nr > 0 && ir[nr - 1] >= nd + np);
+
+	/* count the number of data blocks to recover */
+	nrd = 0;
+	while (nrd < nr && ir[nrd] < nd)
+		++nrd;
+
+	/* all the remaining are parity */
+	nrp = nr - nrd;
+
+	/* enforce limit on number of failures */
+	BUG_ON(nrd > nd);
+	BUG_ON(nrp > np);
+
+	/* if failed data is present */
+	if (nrd != 0) {
+		int ip[RAID_PARITY_MAX];
+		int j, k;
+
+		/* setup the vector of parities to use */
+		for (i = 0, j = 0, k = 0; i < np; ++i) {
+			if (j < nrp && ir[nrd + j] == nd + i) {
+				/* this parity has to be recovered */
+				++j;
+			} else {
+				/* this parity is used for recovering */
+				ip[k] = i;
+				++k;
+			}
+		}
+
+		/* recover the nrd data blocks specified in ir[], */
+		/* using the first nrd parity in ip[] for recovering */
+		raid_rec_ptr[nrd - 1](nrd, ir, ip, nd, size, v);
+	}
+
+	/* recompute all the parities up to the last bad one */
+	if (nrp != 0)
+		raid_gen(nd, ir[nr - 1] - nd + 1, size, v);
+}
+
+/**
+ * Recovers the @nr failed data blocks listed in @id, using the parities
+ * listed in @ip. Parity blocks are not recomputed here.
+ *
+ * Any violation of the argument constraints triggers BUG_ON().
+ *
+ * @nr Number of failed data blocks. It must be in the
+ *   0..min(@nd, RAID_PARITY_MAX) range.
+ * @id[] Vector of @nr indexes of the data blocks to recover, in strictly
+ *   increasing order, each one in the 0..@nd-1 range.
+ * @ip[] Vector of @nr indexes of the parities to use, in strictly
+ *   increasing order, each one in the 0..RAID_PARITY_MAX-1 range.
+ * @nd Number of data blocks.
+ * @size Size of the blocks. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ */
+void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v)
+{
+	int i;
+
+	/* enforce limit on size */
+	BUG_ON(size % 64 != 0);
+
+	/* enforce limit on number of failures */
+	/* a negative count would index raid_rec_ptr[] out of bounds */
+	BUG_ON(nr < 0);
+	BUG_ON(nr > nd);
+	BUG_ON(nr > RAID_PARITY_MAX);
+
+	/* enforce order in index vector for data */
+	for (i = 1; i < nr; ++i) {
+		BUG_ON(id[i - 1] >= id[i]);
+	}
+
+	/* enforce limit on index vector for data; being it ordered, */
+	/* it's enough to check the first and the last entries */
+	BUG_ON(nr > 0 && id[0] < 0);
+	BUG_ON(nr > 0 && id[nr - 1] >= nd);
+
+	/* enforce order in index vector for parity */
+	for (i = 1; i < nr; ++i) {
+		BUG_ON(ip[i - 1] >= ip[i]);
+	}
+
+	/* enforce limit on index vector for parity; the recovering */
+	/* keeps per-parity arrays of RAID_PARITY_MAX elements that */
+	/* are indexed up to ip[nr - 1] */
+	BUG_ON(nr > 0 && ip[0] < 0);
+	BUG_ON(nr > 0 && ip[nr - 1] >= RAID_PARITY_MAX);
+
+	/* if failed data is present */
+	if (nr != 0)
+		raid_rec_ptr[nr - 1](nr, id, ip, nd, size, v);
+}
+
--- a/raid/raid.h
+++ b/raid/raid.h
@ -0,0 +1,229 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_H
+#define __RAID_H
+
+/**
+ * RAID mode supporting up to 6 parities.
+ *
+ * It requires SSSE3 to get good performance with triple or more parities.
+ *
+ * This is the default mode set after calling raid_init().
+ */
+#define RAID_MODE_CAUCHY 0
+
+/**
+ * RAID mode supporting up to 3 parities.
+ *
+ * It has a fast triple parity implementation without SSSE3, but it cannot
+ * go beyond triple parity.
+ *
+ * This is mostly intended for low end CPUs like ARM and AMD Athlon.
+ */
+#define RAID_MODE_VANDERMONDE 1
+
+/**
+ * Maximum number of parity disks supported.
+ */
+#define RAID_PARITY_MAX 6
+
+/**
+ * Maximum number of data disks supported.
+ */
+#define RAID_DATA_MAX 251
+
+/**
+ * Initializes the RAID system.
+ *
+ * You must call this function before any other.
+ *
+ * The RAID system is initialized in the RAID_MODE_CAUCHY mode.
+ */
+void raid_init(void);
+
+/**
+ * Runs a basic functionality self test.
+ *
+ * The test is immediate, and it's intended to be run at application
+ * startup to check the integrity of the RAID system.
+ *
+ * It returns 0 on success.
+ */
+int raid_selftest(void);
+
+/**
+ * Sets the mode to use. One of RAID_MODE_*.
+ *
+ * You can change mode at any time, and it will affect next calls to raid_gen(),
+ * raid_rec() and raid_data().
+ *
+ * The two modes are compatible for the first two levels of parity.
+ * The third one is different.
+ */
+void raid_mode(int mode);
+
+/**
+ * Sets the zero buffer to use in recovering.
+ *
+ * Before calling raid_rec() and raid_data() you must provide a memory
+ * buffer filled with zero with the same size of the blocks to recover.
+ *
+ * This buffer is only read and never written.
+ */
+void raid_zero(void *zero);
+
+/**
+ * Computes parity blocks.
+ *
+ * This function computes the specified number of parity blocks of the
+ * provided set of data blocks.
+ *
+ * Each parity block allows to recover one data block.
+ *
+ * @nd Number of data blocks.
+ * @np Number of parities blocks to compute.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @np) elements. The starting elements are the blocks for
+ *   data, following with the parity blocks.
+ *   Data blocks are only read and not modified. Parity blocks are written.
+ *   Each block has @size bytes.
+ */
+void raid_gen(int nd, int np, size_t size, void **v);
+
+/**
+ * Recovers failures in data and parity blocks.
+ *
+ * This function recovers all the data and parity blocks marked as bad
+ * in the @ir vector.
+ *
+ * Ensure to have @nr <= @np, otherwise recovering is not possible.
+ *
+ * The parities blocks used for recovering are automatically selected from
+ * the ones NOT present in the @ir vector.
+ *
+ * In case there are more parity blocks than needed, the parities at lower
+ * indexes are used in the recovering, and the others are ignored.
+ *
+ * Note that no internal integrity check is done when recovering. If the
+ * provided parities are correct, the resulting data will be correct.
+ * If parities are wrong, the resulting recovered data will be wrong.
+ * This happens even in the case you have more parities blocks than needed,
+ * and some form of integrity verification would be possible.
+ *
+ * @nr Number of failed data and parity blocks to recover.
+ * @ir[] Vector of @nr indexes of the failed data and parity blocks.
+ *   The indexes start from 0. They must be in order.
+ *   The first parity is represented with value @nd, the second with value
+ *   @nd + 1, just like positions in the @v vector.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @np) elements. The starting elements are the blocks
+ *   for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ */
+void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v);
+
+/**
+ * Recovers failures in data blocks only.
+ *
+ * This function recovers all the data blocks marked as bad in the @id vector.
+ * The parity blocks are not modified.
+ *
+ * @nr Number of failed data blocks to recover.
+ * @id[] Vector of @nr indexes of the data blocks to recover.
+ *   The indexes start from 0. They must be in order.
+ * @ip[] Vector of @nr indexes of the parity blocks to use for recovering.
+ *   The indexes start from 0. They must be in order.
+ * @nd Number of data blocks.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @ip[@nr - 1] + 1) elements. The starting elements are the
+ *   blocks for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ */
+void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v);
+
+/**
+ * Check the provided failed blocks combination.
+ *
+ * This function checks if the specified failed blocks combination satisfies
+ * the redundancy information. A combination is assumed matching, if the
+ * remaining valid parity is matching the expected value after recovering.
+ *
+ * The number of failed blocks @nr must be strictly less than the number of
+ * parities @np, because you need one more parity to validate the recovering.
+ *
+ * No data or parity blocks are modified.
+ *
+ * @nr Number of failed data and parity blocks.
+ * @ir[] Vector of @nr indexes of the failed data and parity blocks.
+ *   The indexes start from 0. They must be in order.
+ *   The first parity is represented with value @nd, the second with value
+ *   @nd + 1, just like positions in the @v vector.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @np) elements. The starting elements are the blocks
+ *   for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ * @return 0 if the check is satisfied. -1 otherwise.
+ */
+int raid_check(int nr, int *ir, int nd, int np, size_t size, void **v);
+
+/**
+ * Scan for failed blocks.
+ *
+ * This function identifies the failed data and parity blocks using the
+ * available redundancy.
+ *
+ * It uses a brute force method, so the call can be expensive.
+ * The expected execution time is proportional to the binomial coefficient
+ * @np + @nd choose @np - 1, usually written as:
+ *
+ * ( @np + @nd )
+ * (           )
+ * (  @np - 1  )
+ *
+ * No data or parity blocks are modified.
+ *
+ * The failed block indexes are returned in the @ir vector.
+ * It must have space for at least @np - 1 values.
+ *
+ * The returned @ir vector can then be used in a raid_rec() call to recover
+ * the failed data and parity blocks.
+ *
+ * @ir[] Vector filled with the indexes of the failed data and parity blocks.
+ *   The indexes start from 0 and they are in order.
+ *   The first parity is represented with value @nd, the second with value
+ *   @nd + 1, just like positions in the @v vector.
+ * @nd Number of data blocks.
+ * @np Number of parity blocks.
+ * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
+ * @v Vector of pointers to the blocks of data and parity.
+ *   It has (@nd + @np) elements. The starting elements are the blocks
+ *   for data, following with the parity blocks.
+ *   Each block has @size bytes.
+ * @return Number of block indexes returned in the @ir vector.
+ *   0 if no error is detected.
+ *   -1 if it's not possible to identify the failed disks.
+ */
+int raid_scan(int *ir, int nd, int np, size_t size, void **v);
+
+#endif
+
--- a/raid/tables.c
+++ b/raid/tables.c
--- a/raid/tag.c
+++ b/raid/tag.c
@ -0,0 +1,145 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+/**
+ * Table associating each implementation function with a short tag.
+ *
+ * The table is searched by raid_tag() comparing the function pointers,
+ * and it's terminated by an entry with a null name.
+ *
+ * The pointer is declared without a parameter list because functions
+ * with different signatures are stored in the same table.
+ * The tags presumably name the integer width or the SIMD extension
+ * used by the implementation - TODO confirm against the definitions.
+ */
+static struct raid_func {
+	const char *name; /* tag returned for the function */
+	void (*p)(); /* function identified by the tag */
+} RAID_FUNC[] = {
+	/* entries always compiled in */
+	{ "int8", raid_gen3_int8 },
+	{ "int8", raid_gen4_int8 },
+	{ "int8", raid_gen5_int8 },
+	{ "int8", raid_gen6_int8 },
+	{ "int32", raid_gen1_int32 },
+	{ "int64", raid_gen1_int64 },
+	{ "int32", raid_gen2_int32 },
+	{ "int64", raid_gen2_int64 },
+	{ "int32", raid_genz_int32 },
+	{ "int64", raid_genz_int64 },
+	{ "int8", raid_rec1_int8 },
+	{ "int8", raid_rec2_int8 },
+	{ "int8", raid_recX_int8 },
+
+	/* entries compiled in only when CONFIG_X86 is defined */
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+	{ "sse2", raid_gen1_sse2 },
+	{ "sse2", raid_gen2_sse2 },
+	{ "sse2", raid_genz_sse2 },
+#endif
+#ifdef CONFIG_SSSE3
+	{ "ssse3", raid_gen3_ssse3 },
+	{ "ssse3", raid_gen4_ssse3 },
+	{ "ssse3", raid_gen5_ssse3 },
+	{ "ssse3", raid_gen6_ssse3 },
+	{ "ssse3", raid_rec1_ssse3 },
+	{ "ssse3", raid_rec2_ssse3 },
+	{ "ssse3", raid_recX_ssse3 },
+#endif
+#ifdef CONFIG_AVX2
+	{ "avx2", raid_gen1_avx2 },
+	{ "avx2", raid_gen2_avx2 },
+	{ "avx2", raid_rec1_avx2 },
+	{ "avx2", raid_rec2_avx2 },
+	{ "avx2", raid_recX_avx2 },
+#endif
+#endif
+
+	/* entries compiled in only when CONFIG_X86_64 is defined */
+#ifdef CONFIG_X86_64
+#ifdef CONFIG_SSE2
+	{ "sse2e", raid_gen2_sse2ext },
+	{ "sse2e", raid_genz_sse2ext },
+#endif
+#ifdef CONFIG_SSSE3
+	{ "ssse3e", raid_gen3_ssse3ext },
+	{ "ssse3e", raid_gen4_ssse3ext },
+	{ "ssse3e", raid_gen5_ssse3ext },
+	{ "ssse3e", raid_gen6_ssse3ext },
+#endif
+#ifdef CONFIG_AVX2
+	{ "avx2e", raid_gen3_avx2ext },
+	{ "avx2e", raid_genz_avx2ext },
+	{ "avx2e", raid_gen4_avx2ext },
+	{ "avx2e", raid_gen5_avx2ext },
+	{ "avx2e", raid_gen6_avx2ext },
+#endif
+#endif
+	/* end of table marker */
+	{ 0, 0 }
+};
+
+/**
+ * Gets the tag associated with the specified function.
+ *
+ * The RAID_FUNC table is scanned up to its terminating entry, and the
+ * name of the first entry with a matching pointer is returned.
+ * If no entry matches, "unknown" is returned.
+ */
+static const char *raid_tag(void (*func)())
+{
+	struct raid_func *entry;
+
+	for (entry = RAID_FUNC; entry->name != 0; ++entry) {
+		if (entry->p == func)
+			return entry->name;
+	}
+
+	/* LCOV_EXCL_START */
+	return "unknown";
+	/* LCOV_EXCL_STOP */
+}
+
+/* Tag of the function installed at raid_gen_ptr[0]. */
+const char *raid_gen1_tag(void)
+{
+	return raid_tag(raid_gen_ptr[0]);
+}
+
+/* Tag of the function installed at raid_gen_ptr[1]. */
+const char *raid_gen2_tag(void)
+{
+	return raid_tag(raid_gen_ptr[1]);
+}
+
+/* Tag of the function installed in raid_genz_ptr. */
+const char *raid_genz_tag(void)
+{
+	return raid_tag(raid_genz_ptr);
+}
+
+/*
+ * Tag of the function installed at raid_gen_ptr[2].
+ *
+ * NOTE(review): raid_mode() in raid.c points raid_gen_ptr[2] at
+ * raid_genz_ptr when RAID_MODE_VANDERMONDE is selected, so in that mode
+ * this reports the same tag as raid_genz_tag() and not the one of
+ * raid_gen3_ptr - confirm this is intended.
+ */
+const char *raid_gen3_tag(void)
+{
+	return raid_tag(raid_gen_ptr[2]);
+}
+
+/* Tag of the function installed at raid_gen_ptr[3]. */
+const char *raid_gen4_tag(void)
+{
+	return raid_tag(raid_gen_ptr[3]);
+}
+
+/* Tag of the function installed at raid_gen_ptr[4]. */
+const char *raid_gen5_tag(void)
+{
+	return raid_tag(raid_gen_ptr[4]);
+}
+
+/* Tag of the function installed at raid_gen_ptr[5]. */
+const char *raid_gen6_tag(void)
+{
+	return raid_tag(raid_gen_ptr[5]);
+}
+
+/* Tag of the function installed at raid_rec_ptr[0]. */
+const char *raid_rec1_tag(void)
+{
+	return raid_tag(raid_rec_ptr[0]);
+}
+
+/* Tag of the function installed at raid_rec_ptr[1]. */
+const char *raid_rec2_tag(void)
+{
+	return raid_tag(raid_rec_ptr[1]);
+}
+
+/* Tag of the function installed at raid_rec_ptr[2]. */
+const char *raid_recX_tag(void)
+{
+	return raid_tag(raid_rec_ptr[2]);
+}
+
--- a/raid/test.c
+++ b/raid/test.c
@ -0,0 +1,452 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+#include "cpu.h"
+#include "combo.h"
+#include "memory.h"
+
+/**
+ * Binomial coefficient of n over r.
+ *
+ * Returns 1 when r is 0 or r equals n, and 0 for any other r outside the
+ * 0..n range, so out-of-range arguments cannot cause unbounded work.
+ *
+ * The remaining cases use the multiplicative formula: after k steps the
+ * running value is the binomial coefficient of (n - r + k) over k, which
+ * makes every intermediate division exact.
+ */
+static int ibc(int n, int r)
+{
+	int k;
+	int v = 1;
+
+	if (r == 0 || n == r)
+		return 1;
+	if (r < 0 || r > n)
+		return 0;
+
+	/* n over r equals n over (n - r): iterate over the smaller one */
+	if (r > n - r)
+		r = n - r;
+
+	for (k = 1; k <= r; ++k)
+		v = v * (n - r + k) / k;
+
+	return v;
+}
+
+/**
+ * Integer power: n raised to r (1 when r is 0).
+ */
+static int ipow(int n, int r)
+{
+	int acc;
+
+	/* one multiplication per unit of exponent */
+	for (acc = 1; r != 0; --r)
+		acc *= n;
+
+	return acc;
+}
+
+/**
+ * Checks the combination and permutation iterators against closed forms.
+ *
+ * For every r from 1 to RAID_PARITY_MAX, the sequence walked with
+ * combination_first()/combination_next() must have ibc(RAID_PARITY_MAX, r)
+ * elements, and the one walked with permutation_first()/permutation_next()
+ * must have ipow(RAID_PARITY_MAX, r) elements.
+ *
+ * Returns 0 on success, -1 at the first mismatch.
+ */
+int raid_test_combo(void)
+{
+	int set[RAID_PARITY_MAX];
+	int total;
+	int k;
+
+	/* combinations: the first element counts, plus one per successful step */
+	for (k = 1; k <= RAID_PARITY_MAX; ++k) {
+		combination_first(k, RAID_PARITY_MAX, set);
+		total = 1;
+		while (combination_next(k, RAID_PARITY_MAX, set))
+			++total;
+
+		if (total != ibc(RAID_PARITY_MAX, k)) {
+			/* LCOV_EXCL_START */
+			return -1;
+			/* LCOV_EXCL_STOP */
+		}
+	}
+
+	/* permutations: same counting scheme, different expected total */
+	for (k = 1; k <= RAID_PARITY_MAX; ++k) {
+		permutation_first(k, RAID_PARITY_MAX, set);
+		total = 1;
+		while (permutation_next(k, RAID_PARITY_MAX, set))
+			++total;
+
+		if (total != ipow(RAID_PARITY_MAX, k)) {
+			/* LCOV_EXCL_START */
+			return -1;
+			/* LCOV_EXCL_STOP */
+		}
+	}
+
+	return 0;
+}
+
+/**
+ * Checks that raid_insert() keeps its output ordered.
+ *
+ * For every length from 1 to RAID_PARITY_MAX, each sequence produced by
+ * permutation_first()/permutation_next() is inserted element by element
+ * into a scratch array, which must end up in non-decreasing order.
+ *
+ * Returns 0 on success, -1 at the first out-of-order pair.
+ */
+int raid_test_insert(void)
+{
+	int perm[RAID_PARITY_MAX];
+	int sorted[RAID_PARITY_MAX];
+	int len;
+	int k;
+
+	for (len = 1; len <= RAID_PARITY_MAX; ++len) {
+		permutation_first(len, RAID_PARITY_MAX, perm);
+		do {
+			/* k is also the number of elements inserted so far */
+			for (k = 0; k < len; ++k)
+				raid_insert(k, sorted, perm[k]);
+
+			/* no element may be greater than its successor */
+			for (k = 0; k + 1 < len; ++k) {
+				if (sorted[k] > sorted[k + 1]) {
+					/* LCOV_EXCL_START */
+					return -1;
+					/* LCOV_EXCL_STOP */
+				}
+			}
+		} while (permutation_next(len, RAID_PARITY_MAX, perm));
+	}
+
+	return 0;
+}
+
+/**
+ * Checks that raid_sort() orders its input.
+ *
+ * For every length from 1 to RAID_PARITY_MAX, each sequence produced by
+ * permutation_first()/permutation_next() is copied, sorted with
+ * raid_sort(), and verified to be in non-decreasing order.
+ *
+ * Returns 0 on success, -1 at the first out-of-order pair.
+ */
+int raid_test_sort(void)
+{
+	int perm[RAID_PARITY_MAX];
+	int work[RAID_PARITY_MAX];
+	int len;
+	int k;
+
+	for (len = 1; len <= RAID_PARITY_MAX; ++len) {
+		permutation_first(len, RAID_PARITY_MAX, perm);
+		do {
+			/* sort a scratch copy, the iterator state stays untouched */
+			memcpy(work, perm, len * sizeof(work[0]));
+
+			raid_sort(len, work);
+
+			/* no element may be greater than its successor */
+			for (k = 0; k + 1 < len; ++k) {
+				if (work[k] > work[k + 1]) {
+					/* LCOV_EXCL_START */
+					return -1;
+					/* LCOV_EXCL_STOP */
+				}
+			}
+		} while (permutation_next(len, RAID_PARITY_MAX, perm));
+	}
+
+	return 0;
+}
+
+/**
+ * Exercises every available recovery routine against reference parity.
+ *
+ * The data buffers are filled with pseudo-random bytes and the parity is
+ * computed once with raid_gen_ref(). Then, for each number of failures nr
+ * from 1 to np, each combination of nr data buffers and each combination
+ * of nr parities, every recovery function registered for that level
+ * rebuilds the missing data into scratch buffers, which are compared with
+ * the original data.
+ *
+ * np is RAID_PARITY_MAX in RAID_MODE_CAUCHY and 3 in any other mode.
+ *
+ * Vector layout: nd data buffers, np parity buffers, np scratch buffers,
+ * one zero-filled buffer and one "waste" buffer.
+ *
+ * Returns 0 on success, -1 on allocation failure or at the first mismatch.
+ */
+int raid_test_rec(int mode, int nd, size_t size)
+{
+	void (*rec[RAID_PARITY_MAX][4])(
+		int nr, int *id, int *ip, int nd, size_t size, void **vbuf);
+	int rec_count[RAID_PARITY_MAX];
+	void *data_save[RAID_PARITY_MAX];
+	void *parity_save[RAID_PARITY_MAX];
+	int id[RAID_PARITY_MAX];
+	int ip[RAID_PARITY_MAX];
+	void *v_alloc;
+	void **v;
+	void **data;
+	void **parity;
+	void **test;
+	void *waste;
+	int nv;
+	int np;
+	int nr;
+	int lvl;
+	int fn;
+	int k;
+	int ret;
+
+	raid_mode(mode);
+	np = (mode == RAID_MODE_CAUCHY) ? RAID_PARITY_MAX : 3;
+
+	nv = nd + np * 2 + 2;
+
+	v = raid_malloc_vector(nd, nv, size, &v_alloc);
+	if (!v) {
+		/* LCOV_EXCL_START */
+		return -1;
+		/* LCOV_EXCL_STOP */
+	}
+
+	/* name the sub-ranges of the vector */
+	data = v;
+	parity = data + nd;
+	test = parity + np;
+	waste = v[nv - 1];
+
+	/* remember the real parity buffers before they get swapped out */
+	for (k = 0; k < np; ++k)
+		parity_save[k] = parity[k];
+
+	/* zero-fill the second-to-last buffer and pass it to raid_zero() */
+	memset(v[nv - 2], 0, size);
+	raid_zero(v[nv - 2]);
+
+	/* fill with pseudo-random data with the arbitrary seed "1" */
+	raid_mrand_vector(1, nd, size, v);
+
+	/*
+	 * Collect the recovery functions for each level: level 0 uses the
+	 * rec1 family, level 1 the rec2 family, every other level recX.
+	 */
+	for (lvl = 0; lvl < np; ++lvl) {
+		int n = 0;
+
+		switch (lvl) {
+		case 0:
+			rec[lvl][n++] = raid_rec1_int8;
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+			if (raid_cpu_has_ssse3())
+				rec[lvl][n++] = raid_rec1_ssse3;
+#endif
+#ifdef CONFIG_AVX2
+			if (raid_cpu_has_avx2())
+				rec[lvl][n++] = raid_rec1_avx2;
+#endif
+#endif
+			break;
+		case 1:
+			rec[lvl][n++] = raid_rec2_int8;
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+			if (raid_cpu_has_ssse3())
+				rec[lvl][n++] = raid_rec2_ssse3;
+#endif
+#ifdef CONFIG_AVX2
+			if (raid_cpu_has_avx2())
+				rec[lvl][n++] = raid_rec2_avx2;
+#endif
+#endif
+			break;
+		default:
+			rec[lvl][n++] = raid_recX_int8;
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+			if (raid_cpu_has_ssse3())
+				rec[lvl][n++] = raid_recX_ssse3;
+#endif
+#ifdef CONFIG_AVX2
+			if (raid_cpu_has_avx2())
+				rec[lvl][n++] = raid_recX_avx2;
+#endif
+#endif
+			break;
+		}
+
+		rec_count[lvl] = n;
+	}
+
+	/* compute the reference parity */
+	raid_gen_ref(nd, np, size, v);
+
+	/* point every parity slot to the waste buffer until it is selected */
+	for (k = 0; k < np; ++k)
+		parity[k] = waste;
+
+	/* anything leaving the loops early is a failure */
+	ret = -1;
+
+	/* all parity levels */
+	for (nr = 1; nr <= np; ++nr) {
+		/* all combinations (nr of nd) disks */
+		combination_first(nr, nd, id);
+		do {
+			/* all combinations (nr of np) parities */
+			combination_first(nr, np, ip);
+			do {
+				/* for each recover function of this level */
+				for (fn = 0; fn < rec_count[nr - 1]; ++fn) {
+					for (k = 0; k < nr; ++k) {
+						/* swap the failed data for a scratch buffer */
+						data_save[k] = data[id[k]];
+						data[id[k]] = test[k];
+						/* expose the real parity to recover from */
+						parity[ip[k]] = parity_save[ip[k]];
+					}
+
+					rec[nr - 1][fn](nr, id, ip, nd, size, v);
+
+					/* rebuilt data must match the original */
+					for (k = 0; k < nr; ++k) {
+						if (memcmp(test[k], data_save[k], size) != 0) {
+							/* LCOV_EXCL_START */
+							goto out;
+							/* LCOV_EXCL_STOP */
+						}
+					}
+
+					for (k = 0; k < nr; ++k) {
+						/* put the original data back */
+						data[id[k]] = data_save[k];
+						/* hide the parity again */
+						parity[ip[k]] = waste;
+					}
+				}
+			} while (combination_next(nr, np, ip));
+		} while (combination_next(nr, nd, id));
+	}
+
+	ret = 0;
+
+out:
+	free(v_alloc);
+	free(v);
+	return ret;
+}
+
+/**
+ * Checks every available parity generation routine against raid_gen_ref().
+ *
+ * All buffers are filled with pseudo-random bytes, the reference parity is
+ * computed with raid_gen_ref() and copied into np backup buffers. Each
+ * generation function selected for the mode and the running CPU is then
+ * called on the same vector and the np parity buffers are compared with
+ * the backups.
+ *
+ * np is RAID_PARITY_MAX in RAID_MODE_CAUCHY and 3 in any other mode.
+ *
+ * NOTE(review): the parity buffers are not reset between functions, so any
+ * level a function leaves untouched still holds the reference values --
+ * presumably intended, confirm.
+ *
+ * Returns 0 on success, -1 on allocation failure, memory check failure or
+ * at the first mismatch.
+ */
+int raid_test_par(int mode, int nd, size_t size)
+{
+	void (*gen[64])(int nd, size_t size, void **vbuf);
+	void *v_alloc;
+	void **v;
+	void **parity;
+	void **expected;
+	int ngen = 0;
+	int nv;
+	int np;
+	int fn;
+	int k;
+	int ret = -1;
+
+	raid_mode(mode);
+	np = (mode == RAID_MODE_CAUCHY) ? RAID_PARITY_MAX : 3;
+
+	/* nd data buffers, np parity buffers and np backup buffers */
+	nv = nd + np * 2;
+
+	v = raid_malloc_vector(nd, nv, size, &v_alloc);
+	if (!v) {
+		/* LCOV_EXCL_START */
+		return -1;
+		/* LCOV_EXCL_STOP */
+	}
+
+	parity = v + nd;
+	expected = parity + np;
+
+	/* check memory */
+	if (raid_mtest_vector(nv, size, v) != 0) {
+		/* LCOV_EXCL_START */
+		goto out;
+		/* LCOV_EXCL_STOP */
+	}
+
+	/* fill with pseudo-random data with the arbitrary seed "2" */
+	raid_mrand_vector(2, nv, size, v);
+
+	/* compute the reference parity and keep a copy of it */
+	raid_gen_ref(nd, np, size, v);
+
+	for (k = 0; k < np; ++k)
+		memcpy(expected[k], parity[k], size);
+
+	/* functions tested in every mode */
+	gen[ngen++] = raid_gen1_int32;
+	gen[ngen++] = raid_gen1_int64;
+	gen[ngen++] = raid_gen2_int32;
+	gen[ngen++] = raid_gen2_int64;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+	if (raid_cpu_has_sse2()) {
+		gen[ngen++] = raid_gen1_sse2;
+		gen[ngen++] = raid_gen2_sse2;
+#ifdef CONFIG_X86_64
+		gen[ngen++] = raid_gen2_sse2ext;
+#endif
+	}
+#endif
+
+#ifdef CONFIG_AVX2
+	if (raid_cpu_has_avx2()) {
+		gen[ngen++] = raid_gen1_avx2;
+		gen[ngen++] = raid_gen2_avx2;
+	}
+#endif
+#endif /* CONFIG_X86 */
+
+	if (mode == RAID_MODE_CAUCHY) {
+		/* gen3..gen6 are tested only in Cauchy mode */
+		gen[ngen++] = raid_gen3_int8;
+		gen[ngen++] = raid_gen4_int8;
+		gen[ngen++] = raid_gen5_int8;
+		gen[ngen++] = raid_gen6_int8;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSSE3
+		if (raid_cpu_has_ssse3()) {
+			gen[ngen++] = raid_gen3_ssse3;
+			gen[ngen++] = raid_gen4_ssse3;
+			gen[ngen++] = raid_gen5_ssse3;
+			gen[ngen++] = raid_gen6_ssse3;
+#ifdef CONFIG_X86_64
+			gen[ngen++] = raid_gen3_ssse3ext;
+			gen[ngen++] = raid_gen4_ssse3ext;
+			gen[ngen++] = raid_gen5_ssse3ext;
+			gen[ngen++] = raid_gen6_ssse3ext;
+#endif
+		}
+#endif
+
+#ifdef CONFIG_AVX2
+#ifdef CONFIG_X86_64
+		if (raid_cpu_has_avx2()) {
+			gen[ngen++] = raid_gen3_avx2ext;
+			gen[ngen++] = raid_gen4_avx2ext;
+			gen[ngen++] = raid_gen5_avx2ext;
+			gen[ngen++] = raid_gen6_avx2ext;
+		}
+#endif
+#endif
+#endif /* CONFIG_X86 */
+	} else {
+		/* genz is tested in every other mode */
+		gen[ngen++] = raid_genz_int32;
+		gen[ngen++] = raid_genz_int64;
+
+#ifdef CONFIG_X86
+#ifdef CONFIG_SSE2
+		if (raid_cpu_has_sse2()) {
+			gen[ngen++] = raid_genz_sse2;
+#ifdef CONFIG_X86_64
+			gen[ngen++] = raid_genz_sse2ext;
+#endif
+		}
+#endif
+
+#ifdef CONFIG_AVX2
+#ifdef CONFIG_X86_64
+		if (raid_cpu_has_avx2())
+			gen[ngen++] = raid_genz_avx2ext;
+#endif
+#endif
+#endif /* CONFIG_X86 */
+	}
+
+	/* run each function and compare its parity with the backup */
+	for (fn = 0; fn < ngen; ++fn) {
+		gen[fn](nd, size, v);
+
+		for (k = 0; k < np; ++k) {
+			if (memcmp(expected[k], parity[k], size) != 0) {
+				/* LCOV_EXCL_START */
+				goto out;
+				/* LCOV_EXCL_STOP */
+			}
+		}
+	}
+
+	ret = 0;
+
+out:
+	free(v_alloc);
+	free(v);
+	return ret;
+}
+
--- a/raid/test.h
+++ b/raid/test.h
@ -0,0 +1,68 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __RAID_TEST_H
+#define __RAID_TEST_H
+
+#include <stddef.h> /* size_t */
+
+/**
+ * Tests insertion function.
+ *
+ * Test raid_insert() with all the sequences generated by the permutation
+ * iterators, for every length up to RAID_PARITY_MAX.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int raid_test_insert(void);
+
+/**
+ * Tests sorting function.
+ *
+ * Test raid_sort() with all the sequences generated by the permutation
+ * iterators, for every length up to RAID_PARITY_MAX.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int raid_test_sort(void);
+
+/**
+ * Tests combination and permutation functions.
+ *
+ * Tests combination_first()/combination_next() and
+ * permutation_first()/permutation_next() for all the parity levels,
+ * checking the number of elements they generate.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int raid_test_combo(void);
+
+/**
+ * Tests recovering functions.
+ *
+ * All the recovering functions are tested with all the combinations
+ * of failing disks and recovering parities.
+ *
+ * mode is passed to raid_mode(); RAID_MODE_CAUCHY tests RAID_PARITY_MAX
+ * parity levels, any other mode tests 3. nd is the number of data disks
+ * and size the size in bytes of each buffer.
+ *
+ * Take care that the test time grows exponentially with the number of disks.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int raid_test_rec(int mode, int nd, size_t size);
+
+/**
+ * Tests parity generation functions.
+ *
+ * All the parity generation functions are tested with the specified
+ * number of disks.
+ *
+ * mode is passed to raid_mode(); RAID_MODE_CAUCHY tests RAID_PARITY_MAX
+ * parity levels, any other mode tests 3. nd is the number of data disks
+ * and size the size in bytes of each buffer.
+ *
+ * Returns 0 on success, -1 on failure.
+ */
+int raid_test_par(int mode, int nd, size_t size);
+
+#endif
+
--- a/raid/x86.c
+++ b/raid/x86.c
--- a/raid/x86z.c
+++ b/raid/x86z.c
@ -0,0 +1,255 @@
+/*
+ * Copyright (C) 2013 Andrea Mazzoleni
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include "internal.h"
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+/*
+ * Byte constants, each replicated over 16 bytes, loaded into vector
+ * registers by the GENz routines in this file:
+ *
+ * poly - XORed into the bytes whose top bit was set before being doubled.
+ * half - XORed into the bytes whose low bit was set before being halved.
+ * low7 - mask applied after the 16-bit right shift by one, clearing the
+ *        top bit of every byte.
+ */
+static const struct gfzconst16 {
+	uint8_t poly[16];
+	uint8_t half[16];
+	uint8_t low7[16];
+} gfzconst16 __aligned(64) =
+{
+	{
+		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
+		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
+	},
+	{
+		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e,
+		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e
+	},
+	{
+		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
+		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
+	}
+};
+#endif
+
+#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
+/*
+ * GENz (triple parity with powers of 2^-1) SSE2 implementation
+ *
+ * vv holds nd data buffers followed by the three parity buffers written
+ * here: vv[nd] (p), vv[nd + 1] (q) and vv[nd + 2] (r). Each iteration of
+ * the outer loop handles 16 bytes at offset i of every buffer, using
+ * aligned loads (movdqa) and non-temporal stores (movntdq).
+ *
+ * Register use:
+ *   xmm0 = p accumulator, xmm1 = q accumulator, xmm2 = r accumulator
+ *   xmm3 = gfzconst16.half, xmm6 = gfzconst16.low7, xmm7 = gfzconst16.poly
+ *   xmm4, xmm5 = temporaries
+ *
+ * NOTE(review): presumably size must be a multiple of 16 and every buffer
+ * 16-byte aligned -- confirm against the callers.
+ */
+void raid_genz_sse2(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t**)vv;
+	uint8_t *p;
+	uint8_t *q;
+	uint8_t *r;
+	int d, l;
+	size_t i;
+
+	/* l is the index of the last data buffer */
+	l = nd - 1;
+	p = v[nd];
+	q = v[nd + 1];
+	r = v[nd + 2];
+
+	raid_sse_begin();
+
+	/* load the constants once, before the main loop */
+	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
+	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
+	asm volatile ("movdqa %0,%%xmm6" : : "m" (gfzconst16.low7[0]));
+
+	for (i = 0; i < size; i += 16) {
+		/* start the three accumulators from the last data buffer */
+		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+		asm volatile ("movdqa %xmm0,%xmm1");
+		asm volatile ("movdqa %xmm0,%xmm2");
+		for (d = l - 1; d >= 0; --d) {
+			/*
+			 * q: build a mask of the bytes with the top bit set
+			 * (0 > byte as signed), double every byte, then XOR
+			 * poly into the masked bytes
+			 */
+			asm volatile ("pxor %xmm4,%xmm4");
+			asm volatile ("pcmpgtb %xmm1,%xmm4");
+			asm volatile ("paddb %xmm1,%xmm1");
+			asm volatile ("pand %xmm7,%xmm4");
+			asm volatile ("pxor %xmm4,%xmm1");
+
+			/*
+			 * r: the word shift left by 7 moves the low bit of
+			 * each byte into its sign bit to build the mask; the
+			 * word shift right by 1 halves the bytes, with low7
+			 * clearing the bit shifted in from the upper byte of
+			 * each word; then XOR half into the masked bytes
+			 */
+			asm volatile ("movdqa %xmm2,%xmm4");
+			asm volatile ("pxor %xmm5,%xmm5");
+			asm volatile ("psllw $7,%xmm4");
+			asm volatile ("psrlw $1,%xmm2");
+			asm volatile ("pcmpgtb %xmm4,%xmm5");
+			asm volatile ("pand %xmm6,%xmm2");
+			asm volatile ("pand %xmm3,%xmm5");
+			asm volatile ("pxor %xmm5,%xmm2");
+
+			/* XOR data buffer d into p, q and r */
+			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+			asm volatile ("pxor %xmm4,%xmm0");
+			asm volatile ("pxor %xmm4,%xmm1");
+			asm volatile ("pxor %xmm4,%xmm2");
+		}
+		/* store the 16 result bytes of each parity */
+		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+	}
+
+	raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
+/*
+ * GENz (triple parity with powers of 2^-1) SSE2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ *
+ * Same computation as raid_genz_sse2(), but each iteration of the outer
+ * loop handles 32 bytes as two 16-byte halves, with the instructions of
+ * the two halves and of the q/r updates interleaved.
+ *
+ * Register use (first half / second half):
+ *   xmm0 / xmm8  = p accumulator
+ *   xmm1 / xmm9  = q accumulator
+ *   xmm2 / xmm10 = r accumulator
+ *   xmm4 / xmm12 = q mask, later reused for the loaded data
+ *   xmm5 / xmm13 = r mask
+ *   xmm6 / xmm14 = copy of r shifted left to expose the low bits
+ *   xmm3 = gfzconst16.half, xmm7 = gfzconst16.poly, xmm11 = gfzconst16.low7
+ *
+ * NOTE(review): presumably size must be a multiple of 32 and every buffer
+ * 16-byte aligned -- confirm against the callers.
+ */
+void raid_genz_sse2ext(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t**)vv;
+	uint8_t *p;
+	uint8_t *q;
+	uint8_t *r;
+	int d, l;
+	size_t i;
+
+	/* l is the index of the last data buffer */
+	l = nd - 1;
+	p = v[nd];
+	q = v[nd + 1];
+	r = v[nd + 2];
+
+	raid_sse_begin();
+
+	/* load the constants once, before the main loop */
+	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
+	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
+	asm volatile ("movdqa %0,%%xmm11" : : "m" (gfzconst16.low7[0]));
+
+	for (i = 0; i < size; i += 32) {
+		/* start the accumulators from the last data buffer */
+		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
+		asm volatile ("movdqa %0,%%xmm8" : : "m" (v[l][i + 16]));
+		asm volatile ("movdqa %xmm0,%xmm1");
+		asm volatile ("movdqa %xmm8,%xmm9");
+		asm volatile ("movdqa %xmm0,%xmm2");
+		asm volatile ("movdqa %xmm8,%xmm10");
+		for (d = l - 1; d >= 0; --d) {
+			/* copy r and clear the mask registers */
+			asm volatile ("movdqa %xmm2,%xmm6");
+			asm volatile ("movdqa %xmm10,%xmm14");
+			asm volatile ("pxor %xmm4,%xmm4");
+			asm volatile ("pxor %xmm12,%xmm12");
+			asm volatile ("pxor %xmm5,%xmm5");
+			asm volatile ("pxor %xmm13,%xmm13");
+			/* move the low bit of each r byte into its sign bit */
+			asm volatile ("psllw $7,%xmm6");
+			asm volatile ("psllw $7,%xmm14");
+			/* halve r (word shift, fixed up by low7 below) */
+			asm volatile ("psrlw $1,%xmm2");
+			asm volatile ("psrlw $1,%xmm10");
+			/* q mask: bytes of q with the top bit set */
+			asm volatile ("pcmpgtb %xmm1,%xmm4");
+			asm volatile ("pcmpgtb %xmm9,%xmm12");
+			/* r mask: bytes of r that had the low bit set */
+			asm volatile ("pcmpgtb %xmm6,%xmm5");
+			asm volatile ("pcmpgtb %xmm14,%xmm13");
+			/* double q */
+			asm volatile ("paddb %xmm1,%xmm1");
+			asm volatile ("paddb %xmm9,%xmm9");
+			/* clear the bit shifted into each r byte from its neighbour */
+			asm volatile ("pand %xmm11,%xmm2");
+			asm volatile ("pand %xmm11,%xmm10");
+			/* turn the masks into the values to XOR in */
+			asm volatile ("pand %xmm7,%xmm4");
+			asm volatile ("pand %xmm7,%xmm12");
+			asm volatile ("pand %xmm3,%xmm5");
+			asm volatile ("pand %xmm3,%xmm13");
+			/* apply poly to q and half to r */
+			asm volatile ("pxor %xmm4,%xmm1");
+			asm volatile ("pxor %xmm12,%xmm9");
+			asm volatile ("pxor %xmm5,%xmm2");
+			asm volatile ("pxor %xmm13,%xmm10");
+
+			/* XOR data buffer d into p, q and r */
+			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
+			asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
+			asm volatile ("pxor %xmm4,%xmm0");
+			asm volatile ("pxor %xmm4,%xmm1");
+			asm volatile ("pxor %xmm4,%xmm2");
+			asm volatile ("pxor %xmm12,%xmm8");
+			asm volatile ("pxor %xmm12,%xmm9");
+			asm volatile ("pxor %xmm12,%xmm10");
+		}
+		/* store the 32 result bytes of each parity */
+		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
+		asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
+		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
+		asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
+		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
+		asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
+	}
+
+	raid_sse_end();
+}
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
+/*
+ * GENz (triple parity with powers of 2^-1) AVX2 implementation
+ *
+ * Note that it uses 16 registers, meaning that x64 is required.
+ *
+ * vv holds nd data buffers followed by the three parity buffers written
+ * here: vv[nd] (p), vv[nd + 1] (q) and vv[nd + 2] (r). Each iteration of
+ * the outer loop handles 64 bytes as two 32-byte halves, using aligned
+ * loads (vmovdqa) and non-temporal stores (vmovntdq).
+ *
+ * Register use (first half / second half):
+ *   ymm0 / ymm8  = p accumulator
+ *   ymm1 / ymm9  = q accumulator
+ *   ymm2 / ymm10 = r accumulator
+ *   ymm4 / ymm12 = q mask, later reused for the loaded data
+ *   ymm5 / ymm13 = r mask
+ *   ymm6 / ymm14 = copy of r shifted left to expose the low bits
+ *   ymm3 = gfzconst16.half, ymm7 = gfzconst16.poly, ymm11 = gfzconst16.low7
+ *   ymm15 = zero, the left operand of every compare
+ *
+ * NOTE(review): presumably size must be a multiple of 64 and every buffer
+ * 32-byte aligned -- confirm against the callers.
+ */
+void raid_genz_avx2ext(int nd, size_t size, void **vv)
+{
+	uint8_t **v = (uint8_t**)vv;
+	uint8_t *p;
+	uint8_t *q;
+	uint8_t *r;
+	int d, l;
+	size_t i;
+
+	/* l is the index of the last data buffer */
+	l = nd - 1;
+	p = v[nd];
+	q = v[nd + 1];
+	r = v[nd + 2];
+
+	raid_avx_begin();
+
+	/* replicate each 16-byte constant into both 128-bit halves */
+	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfzconst16.poly[0]));
+	asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfzconst16.half[0]));
+	asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfzconst16.low7[0]));
+	/* keep a zero register for the compares in the loop */
+	asm volatile ("vpxor %ymm15,%ymm15,%ymm15");
+
+	for (i = 0; i < size; i += 64) {
+		/* start the accumulators from the last data buffer */
+		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
+		asm volatile ("vmovdqa %0,%%ymm8" : : "m" (v[l][i + 32]));
+		asm volatile ("vmovdqa %ymm0,%ymm1");
+		asm volatile ("vmovdqa %ymm8,%ymm9");
+		asm volatile ("vmovdqa %ymm0,%ymm2");
+		asm volatile ("vmovdqa %ymm8,%ymm10");
+		for (d = l - 1; d >= 0; --d) {
+			/* move the low bit of each r byte into its sign bit */
+			asm volatile ("vpsllw $7,%ymm2,%ymm6");
+			asm volatile ("vpsllw $7,%ymm10,%ymm14");
+			/* halve r (word shift, fixed up by low7 below) */
+			asm volatile ("vpsrlw $1,%ymm2,%ymm2");
+			asm volatile ("vpsrlw $1,%ymm10,%ymm10");
+			/* q mask: bytes of q with the top bit set (0 > byte) */
+			asm volatile ("vpcmpgtb %ymm1,%ymm15,%ymm4");
+			asm volatile ("vpcmpgtb %ymm9,%ymm15,%ymm12");
+			/* r mask: bytes of r that had the low bit set */
+			asm volatile ("vpcmpgtb %ymm6,%ymm15,%ymm5");
+			asm volatile ("vpcmpgtb %ymm14,%ymm15,%ymm13");
+			/* double q */
+			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
+			asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
+			/* clear the bit shifted into each r byte from its neighbour */
+			asm volatile ("vpand %ymm11,%ymm2,%ymm2");
+			asm volatile ("vpand %ymm11,%ymm10,%ymm10");
+			/* turn the masks into the values to XOR in */
+			asm volatile ("vpand %ymm7,%ymm4,%ymm4");
+			asm volatile ("vpand %ymm7,%ymm12,%ymm12");
+			asm volatile ("vpand %ymm3,%ymm5,%ymm5");
+			asm volatile ("vpand %ymm3,%ymm13,%ymm13");
+			/* apply poly to q and half to r */
+			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+			asm volatile ("vpxor %ymm5,%ymm2,%ymm2");
+			asm volatile ("vpxor %ymm13,%ymm10,%ymm10");
+
+			/* XOR data buffer d into p, q and r */
+			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
+			asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
+			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
+			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
+			asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
+			asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
+			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
+			asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
+		}
+		/* store the 64 result bytes of each parity */
+		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
+		asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
+		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
+		asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
+		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
+		asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
+	}
+
+	raid_avx_end();
+}
+#endif
+