This commit is contained in:
Kent Overstreet 2018-11-23 00:44:20 -05:00
parent cc6479303f
commit c416528eaa
23 changed files with 21844 additions and 1 deletions

View File

@ -4,7 +4,7 @@ INSTALL=install
CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \ CFLAGS+=-std=gnu89 -O2 -g -MMD -Wall \
-Wno-pointer-sign \ -Wno-pointer-sign \
-fno-strict-aliasing \ -fno-strict-aliasing \
-I. -Iinclude \ -I. -Iinclude -Iraid \
-D_FILE_OFFSET_BITS=64 \ -D_FILE_OFFSET_BITS=64 \
-D_GNU_SOURCE \ -D_GNU_SOURCE \
-D_LGPL_SOURCE \ -D_LGPL_SOURCE \

View File

@ -21,6 +21,8 @@
#include <sys/types.h> #include <sys/types.h>
#include <sys/stat.h> #include <sys/stat.h>
#include <raid/raid.h>
#include "cmds.h" #include "cmds.h"
static void usage(void) static void usage(void)
@ -141,6 +143,8 @@ static int data_cmds(int argc, char *argv[])
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
raid_init();
full_cmd = argv[0]; full_cmd = argv[0];
setvbuf(stdout, NULL, _IOLBF, 0); setvbuf(stdout, NULL, _IOLBF, 0);

339
raid/COPYING Normal file
View File

@ -0,0 +1,339 @@
GNU GENERAL PUBLIC LICENSE
Version 2, June 1991
Copyright (C) 1989, 1991 Free Software Foundation, Inc.
675 Mass Ave, Cambridge, MA 02139, USA
Everyone is permitted to copy and distribute verbatim copies
of this license document, but changing it is not allowed.
Preamble
The licenses for most software are designed to take away your
freedom to share and change it. By contrast, the GNU General Public
License is intended to guarantee your freedom to share and change free
software--to make sure the software is free for all its users. This
General Public License applies to most of the Free Software
Foundation's software and to any other program whose authors commit to
using it. (Some other Free Software Foundation software is covered by
the GNU Library General Public License instead.) You can apply it to
your programs, too.
When we speak of free software, we are referring to freedom, not
price. Our General Public Licenses are designed to make sure that you
have the freedom to distribute copies of free software (and charge for
this service if you wish), that you receive source code or can get it
if you want it, that you can change the software or use pieces of it
in new free programs; and that you know you can do these things.
To protect your rights, we need to make restrictions that forbid
anyone to deny you these rights or to ask you to surrender the rights.
These restrictions translate to certain responsibilities for you if you
distribute copies of the software, or if you modify it.
For example, if you distribute copies of such a program, whether
gratis or for a fee, you must give the recipients all the rights that
you have. You must make sure that they, too, receive or can get the
source code. And you must show them these terms so they know their
rights.
We protect your rights with two steps: (1) copyright the software, and
(2) offer you this license which gives you legal permission to copy,
distribute and/or modify the software.
Also, for each author's protection and ours, we want to make certain
that everyone understands that there is no warranty for this free
software. If the software is modified by someone else and passed on, we
want its recipients to know that what they have is not the original, so
that any problems introduced by others will not reflect on the original
authors' reputations.
Finally, any free program is threatened constantly by software
patents. We wish to avoid the danger that redistributors of a free
program will individually obtain patent licenses, in effect making the
program proprietary. To prevent this, we have made it clear that any
patent must be licensed for everyone's free use or not licensed at all.
The precise terms and conditions for copying, distribution and
modification follow.
GNU GENERAL PUBLIC LICENSE
TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION
0. This License applies to any program or other work which contains
a notice placed by the copyright holder saying it may be distributed
under the terms of this General Public License. The "Program", below,
refers to any such program or work, and a "work based on the Program"
means either the Program or any derivative work under copyright law:
that is to say, a work containing the Program or a portion of it,
either verbatim or with modifications and/or translated into another
language. (Hereinafter, translation is included without limitation in
the term "modification".) Each licensee is addressed as "you".
Activities other than copying, distribution and modification are not
covered by this License; they are outside its scope. The act of
running the Program is not restricted, and the output from the Program
is covered only if its contents constitute a work based on the
Program (independent of having been made by running the Program).
Whether that is true depends on what the Program does.
1. You may copy and distribute verbatim copies of the Program's
source code as you receive it, in any medium, provided that you
conspicuously and appropriately publish on each copy an appropriate
copyright notice and disclaimer of warranty; keep intact all the
notices that refer to this License and to the absence of any warranty;
and give any other recipients of the Program a copy of this License
along with the Program.
You may charge a fee for the physical act of transferring a copy, and
you may at your option offer warranty protection in exchange for a fee.
2. You may modify your copy or copies of the Program or any portion
of it, thus forming a work based on the Program, and copy and
distribute such modifications or work under the terms of Section 1
above, provided that you also meet all of these conditions:
a) You must cause the modified files to carry prominent notices
stating that you changed the files and the date of any change.
b) You must cause any work that you distribute or publish, that in
whole or in part contains or is derived from the Program or any
part thereof, to be licensed as a whole at no charge to all third
parties under the terms of this License.
c) If the modified program normally reads commands interactively
when run, you must cause it, when started running for such
interactive use in the most ordinary way, to print or display an
announcement including an appropriate copyright notice and a
notice that there is no warranty (or else, saying that you provide
a warranty) and that users may redistribute the program under
these conditions, and telling the user how to view a copy of this
License. (Exception: if the Program itself is interactive but
does not normally print such an announcement, your work based on
the Program is not required to print an announcement.)
These requirements apply to the modified work as a whole. If
identifiable sections of that work are not derived from the Program,
and can be reasonably considered independent and separate works in
themselves, then this License, and its terms, do not apply to those
sections when you distribute them as separate works. But when you
distribute the same sections as part of a whole which is a work based
on the Program, the distribution of the whole must be on the terms of
this License, whose permissions for other licensees extend to the
entire whole, and thus to each and every part regardless of who wrote it.
Thus, it is not the intent of this section to claim rights or contest
your rights to work written entirely by you; rather, the intent is to
exercise the right to control the distribution of derivative or
collective works based on the Program.
In addition, mere aggregation of another work not based on the Program
with the Program (or with a work based on the Program) on a volume of
a storage or distribution medium does not bring the other work under
the scope of this License.
3. You may copy and distribute the Program (or a work based on it,
under Section 2) in object code or executable form under the terms of
Sections 1 and 2 above provided that you also do one of the following:
a) Accompany it with the complete corresponding machine-readable
source code, which must be distributed under the terms of Sections
1 and 2 above on a medium customarily used for software interchange; or,
b) Accompany it with a written offer, valid for at least three
years, to give any third party, for a charge no more than your
cost of physically performing source distribution, a complete
machine-readable copy of the corresponding source code, to be
distributed under the terms of Sections 1 and 2 above on a medium
customarily used for software interchange; or,
c) Accompany it with the information you received as to the offer
to distribute corresponding source code. (This alternative is
allowed only for noncommercial distribution and only if you
received the program in object code or executable form with such
an offer, in accord with Subsection b above.)
The source code for a work means the preferred form of the work for
making modifications to it. For an executable work, complete source
code means all the source code for all modules it contains, plus any
associated interface definition files, plus the scripts used to
control compilation and installation of the executable. However, as a
special exception, the source code distributed need not include
anything that is normally distributed (in either source or binary
form) with the major components (compiler, kernel, and so on) of the
operating system on which the executable runs, unless that component
itself accompanies the executable.
If distribution of executable or object code is made by offering
access to copy from a designated place, then offering equivalent
access to copy the source code from the same place counts as
distribution of the source code, even though third parties are not
compelled to copy the source along with the object code.
4. You may not copy, modify, sublicense, or distribute the Program
except as expressly provided under this License. Any attempt
otherwise to copy, modify, sublicense or distribute the Program is
void, and will automatically terminate your rights under this License.
However, parties who have received copies, or rights, from you under
this License will not have their licenses terminated so long as such
parties remain in full compliance.
5. You are not required to accept this License, since you have not
signed it. However, nothing else grants you permission to modify or
distribute the Program or its derivative works. These actions are
prohibited by law if you do not accept this License. Therefore, by
modifying or distributing the Program (or any work based on the
Program), you indicate your acceptance of this License to do so, and
all its terms and conditions for copying, distributing or modifying
the Program or works based on it.
6. Each time you redistribute the Program (or any work based on the
Program), the recipient automatically receives a license from the
original licensor to copy, distribute or modify the Program subject to
these terms and conditions. You may not impose any further
restrictions on the recipients' exercise of the rights granted herein.
You are not responsible for enforcing compliance by third parties to
this License.
7. If, as a consequence of a court judgment or allegation of patent
infringement or for any other reason (not limited to patent issues),
conditions are imposed on you (whether by court order, agreement or
otherwise) that contradict the conditions of this License, they do not
excuse you from the conditions of this License. If you cannot
distribute so as to satisfy simultaneously your obligations under this
License and any other pertinent obligations, then as a consequence you
may not distribute the Program at all. For example, if a patent
license would not permit royalty-free redistribution of the Program by
all those who receive copies directly or indirectly through you, then
the only way you could satisfy both it and this License would be to
refrain entirely from distribution of the Program.
If any portion of this section is held invalid or unenforceable under
any particular circumstance, the balance of the section is intended to
apply and the section as a whole is intended to apply in other
circumstances.
It is not the purpose of this section to induce you to infringe any
patents or other property right claims or to contest validity of any
such claims; this section has the sole purpose of protecting the
integrity of the free software distribution system, which is
implemented by public license practices. Many people have made
generous contributions to the wide range of software distributed
through that system in reliance on consistent application of that
system; it is up to the author/donor to decide if he or she is willing
to distribute software through any other system and a licensee cannot
impose that choice.
This section is intended to make thoroughly clear what is believed to
be a consequence of the rest of this License.
8. If the distribution and/or use of the Program is restricted in
certain countries either by patents or by copyrighted interfaces, the
original copyright holder who places the Program under this License
may add an explicit geographical distribution limitation excluding
those countries, so that distribution is permitted only in or among
countries not thus excluded. In such case, this License incorporates
the limitation as if written in the body of this License.
9. The Free Software Foundation may publish revised and/or new versions
of the General Public License from time to time. Such new versions will
be similar in spirit to the present version, but may differ in detail to
address new problems or concerns.
Each version is given a distinguishing version number. If the Program
specifies a version number of this License which applies to it and "any
later version", you have the option of following the terms and conditions
either of that version or of any later version published by the Free
Software Foundation. If the Program does not specify a version number of
this License, you may choose any version ever published by the Free Software
Foundation.
10. If you wish to incorporate parts of the Program into other free
programs whose distribution conditions are different, write to the author
to ask for permission. For software which is copyrighted by the Free
Software Foundation, write to the Free Software Foundation; we sometimes
make exceptions for this. Our decision will be guided by the two goals
of preserving the free status of all derivatives of our free software and
of promoting the sharing and reuse of software generally.
NO WARRANTY
11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY
FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN
OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES
PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED
OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS
TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING,
REPAIR OR CORRECTION.
12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR
REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES,
INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED
TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY
YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER
PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGES.
END OF TERMS AND CONDITIONS
Appendix: How to Apply These Terms to Your New Programs
If you develop a new program, and you want it to be of the greatest
possible use to the public, the best way to achieve this is to make it
free software which everyone can redistribute and change under these terms.
To do so, attach the following notices to the program. It is safest
to attach them to the start of each source file to most effectively
convey the exclusion of warranty; and each file should have at least
the "copyright" line and a pointer to where the full notice is found.
<one line to give the program's name and a brief idea of what it does.>
Copyright (C) 19yy <name of author>
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
Also add information on how to contact you by electronic and paper mail.
If the program is interactive, make it output a short notice like this
when it starts in an interactive mode:
Gnomovision version 69, Copyright (C) 19yy name of author
Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
This is free software, and you are welcome to redistribute it
under certain conditions; type `show c' for details.
The hypothetical commands `show w' and `show c' should show the appropriate
parts of the General Public License. Of course, the commands you use may
be called something other than `show w' and `show c'; they could even be
mouse-clicks or menu items--whatever suits your program.
You should also get your employer (if you work as a programmer) or your
school, if any, to sign a "copyright disclaimer" for the program, if
necessary. Here is a sample; alter the names:
Yoyodyne, Inc., hereby disclaims all copyright interest in the program
`Gnomovision' (which makes passes at compilers) written by James Hacker.
<signature of Ty Coon>, 1 April 1989
Ty Coon, President of Vice
This General Public License does not permit incorporating your program into
proprietary programs. If your program is a subroutine library, you may
consider it more useful to permit linking proprietary applications with the
library. If this is what you want to do, use the GNU Library General
Public License instead of this License.

185
raid/check.c Normal file
View File

@ -0,0 +1,185 @@
/*
* Copyright (C) 2015 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "combo.h"
#include "gf.h"
/**
 * Validate the provided failed blocks.
 *
 * This function checks if the specified failed blocks satisfy the redundancy
 * information using the data from the known valid parity blocks.
 *
 * It's similar to raid_check(), just with a different format for arguments.
 *
 * The number of failed blocks @nr must be strictly less than the number of
 * parities @nv, because you need one more parity to validate the recovering.
 *
 * No data or parity blocks are modified.
 *
 * @nr Number of failed data blocks.
 * @id[] Vector of @nr indexes of the failed data blocks.
 *   The indexes start from 0. They must be in order.
 * @nv Number of valid parity blocks.
 * @ip[] Vector of @nv indexes of the valid parity blocks.
 *   The indexes start from 0. They must be in order.
 * @nd Number of data blocks.
 * @size Size of the blocks pointed by @vv. It must be a multiple of 64.
 * @vv Vector of pointers to the blocks of data and parity.
 *   It has (@nd + @ip[@nv - 1] + 1) elements. The starting elements are the
 *   blocks for data, following with the parity blocks.
 *   Each block has @size bytes.
 * @return 0 if the check is satisfied. -1 otherwise.
 */
static int raid_validate(int nr, int *id, int nv, int *ip, int nd, size_t size, void **vv)
{
uint8_t **v = (uint8_t **)vv;
/* T[j][k] is the GF multiplication table used to rebuild failed block j */
const uint8_t *T[RAID_PARITY_MAX][RAID_PARITY_MAX];
/* G is the coefficient matrix of the linear system, V its inverse */
uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
size_t i;
int j, k, l;
/* we need one spare parity beyond the failures for the redundant check */
BUG_ON(nr >= nv);
/* setup the coefficients matrix */
for (j = 0; j < nr; ++j)
for (k = 0; k < nr; ++k)
G[j * nr + k] = A(ip[j], id[k]);
/* invert it to solve the system of linear equations */
raid_invert(G, V, nr);
/* get multiplication tables */
for (j = 0; j < nr; ++j)
for (k = 0; k < nr; ++k)
T[j][k] = table(V[j * nr + k]);
/* check all positions */
for (i = 0; i < size; ++i) {
uint8_t p[RAID_PARITY_MAX];
/* get parity */
for (j = 0; j < nv; ++j)
p[j] = v[nd + ip[j]][i];
/* compute delta parity, skipping broken disks */
for (j = 0, k = 0; j < nd; ++j) {
uint8_t b;
/* skip broken disks */
if (k < nr && id[k] == j) {
++k;
continue;
}
b = v[j][i];
for (l = 0; l < nv; ++l)
p[l] ^= gfmul[b][gfgen[ip[l]][j]];
}
/* reconstruct data */
for (j = 0; j < nr; ++j) {
uint8_t b = 0;
int idj = id[j];
/* recompute the data using the inverted-matrix tables */
for (k = 0; k < nr; ++k)
b ^= T[j][k][p[k]];
/* add the parity contribution of the reconstructed data */
for (l = nr; l < nv; ++l)
p[l] ^= gfmul[b][gfgen[ip[l]][idj]];
}
/* with the data rebuilt, any leftover parity delta must be zero */
for (l = nr; l < nv; ++l)
if (p[l] != 0)
return -1;
}
return 0;
}
/**
 * Check if the provided set of failed blocks is consistent with the
 * redundancy information.
 *
 * @nr Number of failed blocks.
 * @ir[] Vector of @nr failed block indexes (data first, then parity),
 * sorted in ascending order.
 * @nd Number of data blocks.
 * @np Number of parity blocks.
 * @size Size of each block. It must be a multiple of 64.
 * @v Vector of @nd + @np pointers to the data and parity blocks.
 * @return 0 if the check is satisfied. -1 otherwise.
 */
int raid_check(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	int valid_parity[RAID_PARITY_MAX];
	int n_valid = 0;
	int n_data_failed = 0;
	int f, p;

	/* enforce limit on size */
	BUG_ON(size % 64 != 0);

	/* enforce limit on number of failures */
	BUG_ON(nr >= np); /* >= because we check with extra parity */
	BUG_ON(np > RAID_PARITY_MAX);

	/* the failure indexes must be strictly increasing and in range */
	for (f = 1; f < nr; ++f)
		BUG_ON(ir[f - 1] >= ir[f]);
	BUG_ON(nr > 0 && ir[nr - 1] >= nd + np);

	/* the leading failures (indexes < nd) are the data ones */
	while (n_data_failed < nr && ir[n_data_failed] < nd)
		++n_data_failed;

	/* collect the parities not listed as failed */
	for (f = n_data_failed, p = 0; p < np; ++p) {
		if (f < nr && ir[f] == nd + p)
			++f; /* this parity is broken, skip it */
		else
			valid_parity[n_valid++] = p;
	}

	return raid_validate(n_data_failed, ir, n_valid, valid_parity, nd, size, v);
}
/**
 * Scan for failed blocks.
 *
 * Tries increasing numbers of failures until it finds a combination of
 * blocks whose exclusion makes the remaining redundancy consistent.
 * At most (np - 1) failures can be identified, because one extra parity
 * is always needed to verify a candidate set.
 *
 * @ir[] Output vector filled with the indexes of the failed blocks found.
 * @nd Number of data blocks.
 * @np Number of parity blocks.
 * @size Size of each block. It must be a multiple of 64.
 * @v Vector of @nd + @np pointers to the data and parity blocks.
 * @return The number of failed blocks found (0 if none), or -1 if no
 * combination of at most (np - 1) failures explains the state.
 */
int raid_scan(int *ir, int nd, int np, size_t size, void **v)
{
	int r;

	/*
	 * Check the special case of no failure first; ir is not read when
	 * nr == 0, so pass NULL rather than a bare 0 for the pointer.
	 */
	if (np != 0 && raid_check(0, NULL, nd, np, size, v) == 0)
		return 0;

	/* for each number of possible failures */
	for (r = 1; r < np; ++r) {
		/* try all combinations of r failures on nd + np blocks */
		combination_first(r, nd + np, ir);
		do {
			/* verify if the combination is a valid one */
			if (raid_check(r, ir, nd, np, size, v) == 0)
				return r;
		} while (combination_next(r, nd + np, ir));
	}

	/* no solution found */
	return -1;
}

155
raid/combo.h Normal file
View File

@ -0,0 +1,155 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_COMBO_H
#define __RAID_COMBO_H
#include <assert.h>
/**
 * Initialize the first permutation with repetition of r of n elements.
 *
 * Typical use together with permutation_next() :
 *
 *	int i[R];
 *	permutation_first(R, N, i);
 *	do {
 *		... use i[0], i[1], ..., i[R-1] ...
 *	} while (permutation_next(R, N, i));
 *
 * which enumerates the same tuples as R nested loops, each counting
 * from 0 to N-1.
 */
static __always_inline void permutation_first(int r, int n, int *c)
{
	int k;

	(void)n; /* referenced only by the assert below */
	assert(0 < r && r <= n);

	/* the first permutation is all zeros */
	for (k = 0; k < r; ++k)
		c[k] = 0;
}
/**
 * Advance to the next permutation with repetition of r of n elements.
 *
 * The counters behave like a base-n odometer with the least significant
 * digit at position r-1.
 *
 * @return 0 when the enumeration is finished, 1 otherwise.
 */
static __always_inline int permutation_next(int r, int n, int *c)
{
	int pos;

	/* find the rightmost counter that can still be incremented */
	for (pos = r - 1; pos >= 0; --pos) {
		++c[pos];
		if (c[pos] < n)
			break;
	}

	/* every counter overflowed: the enumeration is complete */
	if (pos < 0)
		return 0;

	/* restart all the counters to the right of the incremented one */
	for (++pos; pos < r; ++pos)
		c[pos] = 0;

	return 1;
}
/**
 * Initialize the first combination without repetition of r of n elements.
 *
 * Typical use together with combination_next() :
 *
 *	int i[R];
 *	combination_first(R, N, i);
 *	do {
 *		... use i[0], i[1], ..., i[R-1] ...
 *	} while (combination_next(R, N, i));
 *
 * which enumerates every strictly increasing R-tuple drawn from 0..N-1.
 */
static __always_inline void combination_first(int r, int n, int *c)
{
	int k;

	(void)n; /* referenced only by the assert below */
	assert(0 < r && r <= n);

	/* the first combination is 0, 1, ..., r-1 */
	for (k = 0; k < r; ++k)
		c[k] = k;
}
/**
 * Advance to the next combination without repetition of r of n elements.
 *
 * Position pos may hold values up to limit-1, where the limit shrinks by
 * one for each step to the left (the values must stay strictly increasing).
 *
 * @return 0 when the enumeration is finished, 1 otherwise.
 */
static __always_inline int combination_next(int r, int n, int *c)
{
	int pos = r - 1;
	int limit = n;

	/* move left until a position that can still be advanced is found */
	while (++c[pos] >= limit) {
		/* the leftmost position overflowed: enumeration complete */
		if (pos == 0)
			return 0;
		--pos;
		--limit;
	}

	/* each following position restarts just after the previous one */
	for (++pos; pos < r; ++pos)
		c[pos] = c[pos - 1] + 1;

	return 1;
}
#endif

331
raid/cpu.h Normal file
View File

@ -0,0 +1,331 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_CPU_H
#define __RAID_CPU_H
#ifdef CONFIG_X86
/**
 * Execute the CPUID instruction.
 *
 * @func_eax Leaf number, loaded into EAX.
 * @sub_ecx Sub-leaf number, loaded into ECX.
 * @reg Output vector of 4 words, filled with EAX, EBX, ECX and EDX.
 */
static inline void raid_cpuid(uint32_t func_eax, uint32_t sub_ecx, uint32_t *reg)
{
asm volatile (
#if defined(__i386__) && defined(__PIC__)
/* allow compilation in PIC mode saving ebx */
/* (EBX is the PIC base register and must not be clobbered directly) */
"xchgl %%ebx, %1\n"
"cpuid\n"
"xchgl %%ebx, %1\n"
: "=a" (reg[0]), "=r" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
: "0" (func_eax), "2" (sub_ecx)
#else
"cpuid\n"
: "=a" (reg[0]), "=b" (reg[1]), "=c" (reg[2]), "=d" (reg[3])
: "0" (func_eax), "2" (sub_ecx)
#endif
);
}
/**
 * Execute the XGETBV instruction with ECX=0, reading XCR0.
 *
 * @reg Output vector: reg[0] receives EAX (low word of XCR0) and
 * reg[3] receives EDX (high word).
 */
static inline void raid_xgetbv(uint32_t* reg)
{
/* get the value of the Extended Control Register ecx=0 */
asm volatile (
/* uses a direct encoding of the XGETBV instruction as only recent */
/* assemblers support it. */
/* the next line is equivalent to: "xgetbv\n" */
".byte 0x0f, 0x01, 0xd0\n"
: "=a" (reg[0]), "=d" (reg[3])
: "c" (0)
);
}
#define CPU_VENDOR_MAX 13
/**
 * Get vendor string, family and model of the CPU.
 *
 * @vendor Output buffer of at least CPU_VENDOR_MAX chars, filled with the
 * NUL terminated CPUID vendor string (e.g. "GenuineIntel").
 * @family Output for the CPU family, including the extended field.
 * @model Output for the CPU model, including the extended field.
 */
static inline void raid_cpu_info(char *vendor, unsigned *family, unsigned *model)
{
	uint32_t reg[4];
	unsigned f, ef, m, em;

	raid_cpuid(0, 0, reg);

	/*
	 * The vendor string is returned in EBX, EDX, ECX (in this order).
	 * Copy with memcpy instead of storing through a casted uint32_t*,
	 * which would violate strict aliasing and could fault on
	 * alignment-sensitive targets.
	 */
	memcpy(vendor, &reg[1], 4);
	memcpy(vendor + 4, &reg[3], 4);
	memcpy(vendor + 8, &reg[2], 4);
	vendor[12] = 0;

	raid_cpuid(1, 0, reg);

	/* base and extended family/model fields of leaf 1 EAX */
	f = (reg[0] >> 8) & 0xF;
	ef = (reg[0] >> 20) & 0xFF;
	m = (reg[0] >> 4) & 0xF;
	em = (reg[0] >> 16) & 0xF;

	if (strcmp(vendor, "AuthenticAMD") == 0) {
		if (f < 15) {
			/* AMD uses the extended fields only from family 15 on */
			*family = f;
			*model = m;
		} else {
			*family = f + ef;
			*model = m + (em << 4);
		}
	} else {
		*family = f + ef;
		*model = m + (em << 4);
	}
}
/**
 * Check that all the requested CPUID leaf 1 feature bits are set.
 *
 * @cpuid_1_ecx Required bits of CPUID.01H:ECX.
 * @cpuid_1_edx Required bits of CPUID.01H:EDX.
 * @return 1 if every requested bit is set, 0 otherwise.
 */
static inline int raid_cpu_match_sse(uint32_t cpuid_1_ecx, uint32_t cpuid_1_edx)
{
	uint32_t reg[4];

	/* leaf 1 reports the basic feature flags in ECX/EDX */
	raid_cpuid(1, 0, reg);

	return (reg[2] & cpuid_1_ecx) == cpuid_1_ecx
		&& (reg[3] & cpuid_1_edx) == cpuid_1_edx;
}
/**
 * Check CPU and OS support for an AVX-class feature set.
 *
 * @cpuid_1_ecx Required bits of CPUID.01H:ECX (e.g. OSXSAVE, AVX).
 * @cpuid_7_ebx Required bits of CPUID.(EAX=07H,ECX=0):EBX.
 * @xcr0 Register state the OS must have enabled in XCR0.
 * @return 1 if all the requirements are satisfied, 0 otherwise.
 */
static inline int raid_cpu_match_avx(uint32_t cpuid_1_ecx, uint32_t cpuid_7_ebx, uint32_t xcr0)
{
	uint32_t reg[4];

	/* basic feature flags from leaf 1 */
	raid_cpuid(1, 0, reg);
	if ((reg[2] & cpuid_1_ecx) != cpuid_1_ecx)
		return 0;

	/* the OS must have enabled the requested state in XCR0 */
	raid_xgetbv(reg);
	if ((reg[0] & xcr0) != xcr0)
		return 0;

	/* extended feature flags from leaf 7 */
	raid_cpuid(7, 0, reg);
	return (reg[1] & cpuid_7_ebx) == cpuid_7_ebx;
}
/**
 * Check whether the CPU supports SSE2.
 *
 * Per the Intel SDM (325462-048US, September 2013), section 11.6.2
 * "Checking for SSE/SSE2 Support": SSE2 is available when
 * CPUID.01H:EDX.SSE2[bit 26] = 1.
 */
static inline int raid_cpu_has_sse2(void)
{
	return raid_cpu_match_sse(
		0,
		1 << 26); /* SSE2 */
}
/**
 * Check whether the CPU supports SSSE3 (and SSE2).
 *
 * Per the Intel SDM (325462-048US, September 2013), section 12.7.2
 * "Checking for SSSE3 Support": first verify SSE/SSE2 as in 11.6.2,
 * then check CPUID.01H:ECX.SSSE3[bit 9] = 1.
 */
static inline int raid_cpu_has_ssse3(void)
{
	return raid_cpu_match_sse(
		1 << 9, /* SSSE3 */
		1 << 26); /* SSE2 */
}
/**
 * Check whether the CPU supports the CRC32 instruction.
 *
 * Per the Intel SDM (325462-048US, September 2013), section 12.12.3
 * "Checking for SSE4.2 Support": CRC32 requires SSE4.2, indicated by
 * CPUID.01H:ECX.SSE4_2[bit 20] = 1.
 */
static inline int raid_cpu_has_crc32(void)
{
	return raid_cpu_match_sse(
		1 << 20, /* CRC32 */
		0);
}
/**
 * Check whether the CPU and OS support AVX2.
 *
 * Per the Intel ISA Extensions Programming Reference (319433-022,
 * October 2014), sections 14.3 and 14.7.1: require
 * CPUID.1:ECX.OSXSAVE[bit 27] = 1 and CPUID.1:ECX.AVX[bit 28] = 1,
 * XCR0[2:1] = 11b (XMM and YMM state enabled by the OS), and
 * CPUID.(EAX=07H,ECX=0H):EBX.AVX2[bit 5] = 1.
 */
static inline int raid_cpu_has_avx2(void)
{
	return raid_cpu_match_avx(
		(1 << 27) | (1 << 28), /* OSXSAVE and AVX */
		1 << 5, /* AVX2 */
		3 << 1); /* OS saves XMM and YMM registers */
}
/*
 * Returns != 0 if both the CPU and the OS support AVX-512 (F + BW).
 *
 * Per the Intel ISA Extensions reference (2.2):
 * - CPUID.1:ECX.OSXSAVE[bit 27] set;
 * - XGETBV reports XCR0[7:5] = 111b and XCR0[2:1] = 11b
 *   (OPMASK/ZMM plus XMM/YMM state saved by the OS);
 * - CPUID.(EAX=07H,ECX=0H):EBX bits AVX512F[16] and AVX512BW[30] set.
 *
 * AVX and AVX2 are deliberately NOT checked: the documented detection
 * sequence doesn't require them.
 */
static inline int raid_cpu_has_avx512bw(void)
{
	unsigned xcr0_mask = (3 << 1) | (7 << 5); /* XMM, YMM and ZMM state */

	return raid_cpu_match_avx(1 << 27, /* XSAVE/XGETBV */
		(1 << 16) | (1 << 30), /* AVX512F and AVX512BW */
		xcr0_mask);
}
/**
 * Check if it's an Intel Atom CPU.
 *
 * Family 6 model numbers, from http://www.sandpile.org/x86/cpuid.htm :
 * 28/38/54/39/53 are the 45/32 nm Atoms, 74/90/55/76/93 the 22/28 nm
 * SoC generations (TGR/ANN/BYT/BSW/SoFIA), 77 the 8-core AVN "Avoton".
 */
static inline int raid_cpu_is_atom(unsigned family, unsigned model)
{
	if (family != 6)
		return 0;

	switch (model) {
	case 28: case 38: case 54: case 39: case 53:
	case 74: case 90: case 55: case 76: case 93:
	case 77:
		return 1;
	default:
		return 0;
	}
}
/**
 * Check if the processor has a slow MULT implementation.
 * If yes, it's better to use a hash not based on multiplication.
 *
 * On Intel Atom cores the MUL-based Murmur3 is far slower than the
 * shift-based Spooky2 (model 28: 378 vs 3413 MB/s in x86 mode,
 * model 77: 1311 vs 4056 MB/s in x64 mode).
 */
static inline int raid_cpu_has_slowmult(void)
{
	char vendor[CPU_VENDOR_MAX];
	unsigned family;
	unsigned model;

	raid_cpu_info(vendor, &family, &model);

	return strcmp(vendor, "GenuineIntel") == 0
		&& raid_cpu_is_atom(family, model);
}
/**
 * Check if the processor has a slow extended set of SSE registers.
 * If yes, it's better to limit the unroll to the first 8 registers.
 *
 * The 16-register PAR2 unroll measured a LITTLE slower than the
 * 8-register one on AMD Bulldozer (par2_sse2 4922 vs par2_sse2e
 * 4465 MB/s) and on Intel Atom model 77 "Avoton" (par2_sse2 5686 vs
 * par2_sse2e 5250 MB/s); PARZ is the exception and is not affected.
 */
static inline int raid_cpu_has_slowextendedreg(void)
{
	char vendor[CPU_VENDOR_MAX];
	unsigned family;
	unsigned model;

	raid_cpu_info(vendor, &family, &model);

	if (strcmp(vendor, "AuthenticAMD") == 0)
		return family == 21; /* Bulldozer */

	if (strcmp(vendor, "GenuineIntel") == 0)
		return raid_cpu_is_atom(family, model);

	return 0;
}
#endif
#endif

137
raid/gf.h Normal file
View File

@ -0,0 +1,137 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_GF_H
#define __RAID_GF_H
/*
* Galois field operations.
*
* Basic range checks are implemented using BUG_ON().
*/
/*
 * GF(2^8) product a*b, via the precomputed table.
 */
static __always_inline uint8_t mul(uint8_t a, uint8_t b)
{
	const uint8_t *row = gfmul[a];

	return row[b];
}
/*
 * GF(2^8) multiplicative inverse 1/a.
 * Undefined for a == 0 (asserts).
 */
static __always_inline uint8_t inv(uint8_t v)
{
	BUG_ON(v == 0); /* 0 has no inverse */

	return gfinv[v];
}
/*
 * GF(2^8) power 2^v, via the precomputed exponent table.
 * Valid only for 0 <= v <= 254 (asserts otherwise).
 */
static __always_inline uint8_t pow2(int v)
{
	BUG_ON(v < 0 || v > 254); /* exponent out of range */

	return gfexp[v];
}
/*
 * Returns the 256-entry multiplication table row for value 'v'.
 */
static __always_inline const uint8_t *table(uint8_t v)
{
	return &gfmul[v][0];
}
/*
 * Generator matrix coefficient for parity 'p' and disk 'd'.
 */
static __always_inline uint8_t A(int p, int d)
{
	const uint8_t *row = gfgen[p];

	return row[d];
}
/*
 * Dereference the bytes at &(p) as uint8_t.
 */
#define v_8(p) (*(uint8_t *)&(p))
/*
 * Dereference the bytes at &(p) as uint32_t.
 * NOTE(review): type-puns through a pointer cast; the project builds
 * with -fno-strict-aliasing and works on library-allocated aligned
 * buffers — confirm both before reusing this macro elsewhere.
 */
#define v_32(p) (*(uint32_t *)&(p))
/*
 * Dereference the bytes at &(p) as uint64_t (same caveats as v_32).
 */
#define v_64(p) (*(uint64_t *)&(p))
/*
 * Multiply each byte of a uint32 by 2 in GF(2^8) (polynomial 0x11d):
 * shift every byte left by one, then XOR 0x1d into the bytes whose
 * top bit shifted out.
 */
static __always_inline uint32_t x2_32(uint32_t v)
{
	uint32_t carry = (v & 0x80808080U) >> 7;

	return ((v << 1) & 0xfefefefeU) ^ (carry * 0x1dU);
}
/*
 * Multiply each byte of a uint64 by 2 in GF(2^8) (polynomial 0x11d):
 * shift every byte left by one, then XOR 0x1d into the bytes whose
 * top bit shifted out.
 */
static __always_inline uint64_t x2_64(uint64_t v)
{
	uint64_t carry = (v & 0x8080808080808080ULL) >> 7;

	return ((v << 1) & 0xfefefefefefefefeULL) ^ (carry * 0x1dULL);
}
/*
 * Divide each byte of a uint32 by 2 in GF(2^8): shift every byte
 * right by one, then XOR 0x8e into the bytes whose low bit shifted
 * out (0x8e is 0x11d >> 1 with the implicit top bit restored).
 */
static __always_inline uint32_t d2_32(uint32_t v)
{
	uint32_t carry = v & 0x01010101U;

	return ((v >> 1) & 0x7f7f7f7fU) ^ (carry * 0x8eU);
}
/*
 * Divide each byte of a uint64 by 2 in GF(2^8): shift every byte
 * right by one, then XOR 0x8e into the bytes whose low bit shifted
 * out.
 */
static __always_inline uint64_t d2_64(uint64_t v)
{
	uint64_t carry = v & 0x0101010101010101ULL;

	return ((v >> 1) & 0x7f7f7f7f7f7f7f7fULL) ^ (carry * 0x8eULL);
}
#endif

94
raid/helper.c Normal file
View File

@ -0,0 +1,94 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
/*
 * Compare-and-swap of two elements of the local vector 'v', the
 * building block of the sorting networks in raid_sort().
 * Note: expands 'a' and 'b' multiple times; use only plain index
 * expressions as arguments.
 */
#define RAID_SWAP(a, b) \
	do { \
		if (v[a] > v[b]) { \
			int t = v[a]; \
			v[a] = v[b]; \
			v[b] = t; \
		} \
	} while (0)
/*
 * Compare-and-swap helper for the sorting networks below.
 */
static void raid_sort_cswap(int *v, int a, int b)
{
	if (v[a] > v[b]) {
		int t = v[a];

		v[a] = v[b];
		v[b] = t;
	}
}

/*
 * Sorts a small vector of integers (2 <= n <= 6) using sorting
 * networks generated with Batcher's Merge-Exchange.
 * Other values of n leave the vector untouched.
 */
void raid_sort(int n, int *v)
{
	switch (n) {
	case 2:
		raid_sort_cswap(v, 0, 1);
		break;
	case 3:
		raid_sort_cswap(v, 0, 2);
		raid_sort_cswap(v, 0, 1);
		raid_sort_cswap(v, 1, 2);
		break;
	case 4:
		raid_sort_cswap(v, 0, 2);
		raid_sort_cswap(v, 1, 3);
		raid_sort_cswap(v, 0, 1);
		raid_sort_cswap(v, 2, 3);
		raid_sort_cswap(v, 1, 2);
		break;
	case 5:
		raid_sort_cswap(v, 0, 4);
		raid_sort_cswap(v, 0, 2);
		raid_sort_cswap(v, 1, 3);
		raid_sort_cswap(v, 2, 4);
		raid_sort_cswap(v, 0, 1);
		raid_sort_cswap(v, 2, 3);
		raid_sort_cswap(v, 1, 4);
		raid_sort_cswap(v, 1, 2);
		raid_sort_cswap(v, 3, 4);
		break;
	case 6:
		raid_sort_cswap(v, 0, 4);
		raid_sort_cswap(v, 1, 5);
		raid_sort_cswap(v, 0, 2);
		raid_sort_cswap(v, 1, 3);
		raid_sort_cswap(v, 2, 4);
		raid_sort_cswap(v, 3, 5);
		raid_sort_cswap(v, 0, 1);
		raid_sort_cswap(v, 2, 3);
		raid_sort_cswap(v, 4, 5);
		raid_sort_cswap(v, 1, 4);
		raid_sort_cswap(v, 1, 2);
		raid_sort_cswap(v, 3, 4);
		break;
	}
}
/*
 * Inserts integer 'i' into the sorted vector 'v' of 'n' elements.
 *
 * Appends at the end and bubbles the new element into place.
 * A binary search is deliberately avoided: the vectors are tiny and
 * the common case is an element already in order.
 */
void raid_insert(int n, int *v, int i)
{
	v[n] = i;

	while (n > 0 && v[n - 1] > v[n]) {
		int t = v[n];

		v[n] = v[n - 1];
		v[n - 1] = t;
		--n;
	}
}

43
raid/helper.h Normal file
View File

@ -0,0 +1,43 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_HELPER_H
#define __RAID_HELPER_H
/**
* Inserts an integer in a sorted vector.
*
* This function can be used to insert indexes in order, ready to be used for
* calling raid_rec().
*
* @n Number of integers currently in the vector.
* @v Vector of integers already sorted.
* It must have extra space for the new elemet at the end.
* @i Value to insert.
*/
void raid_insert(int n, int *v, int i);
/**
* Sorts a small vector of integers.
*
* If you have indexes not in order, you can use this function to sort them
* before calling raid_rec().
*
* @n Number of integers. No more than RAID_PARITY_MAX.
* @v Vector of integers.
*/
void raid_sort(int n, int *v);
#endif

556
raid/int.c Normal file
View File

@ -0,0 +1,556 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "gf.h"
/*
 * GEN1 (RAID5 with xor) 32bit C implementation.
 *
 * XORs the nd data blocks of vv into the parity block vv[nd],
 * two 32-bit lanes per iteration; 'size' must be a multiple of 8.
 */
void raid_gen1_int32(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	size_t off;

	for (off = 0; off < size; off += 8) {
		uint32_t a, b, w;
		int d = nd - 1;

		/* start from the last data block */
		memcpy(&a, &v[d][off], 4);
		memcpy(&b, &v[d][off + 4], 4);
		while (--d >= 0) {
			memcpy(&w, &v[d][off], 4);
			a ^= w;
			memcpy(&w, &v[d][off + 4], 4);
			b ^= w;
		}
		memcpy(&p[off], &a, 4);
		memcpy(&p[off + 4], &b, 4);
	}
}
/*
 * GEN1 (RAID5 with xor) 64bit C implementation.
 *
 * XORs the nd data blocks of vv into the parity block vv[nd],
 * two 64-bit lanes per iteration; 'size' must be a multiple of 16.
 */
void raid_gen1_int64(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	size_t off;

	for (off = 0; off < size; off += 16) {
		uint64_t a, b, w;
		int d = nd - 1;

		/* start from the last data block */
		memcpy(&a, &v[d][off], 8);
		memcpy(&b, &v[d][off + 8], 8);
		while (--d >= 0) {
			memcpy(&w, &v[d][off], 8);
			a ^= w;
			memcpy(&w, &v[d][off + 8], 8);
			b ^= w;
		}
		memcpy(&p[off], &a, 8);
		memcpy(&p[off + 8], &b, 8);
	}
}
/*
 * Doubles each byte of a word in GF(2^8) with polynomial 0x11d
 * (same operation as x2_32 in gf.h).
 */
static uint32_t raid_gen2_x2_32(uint32_t w)
{
	return ((w << 1) & 0xfefefefeU) ^ (((w & 0x80808080U) >> 7) * 0x1dU);
}

/*
 * GEN2 (RAID6 with powers of 2) 32bit C implementation.
 *
 * Computes P (plain xor) into vv[nd] and Q (Horner evaluation with
 * coefficient 2) into vv[nd + 1]; 'size' must be a multiple of 8.
 */
void raid_gen2_int32(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	size_t off;

	for (off = 0; off < size; off += 8) {
		uint32_t pa, pb, qa, qb, da, db;
		int d = nd - 1;

		memcpy(&pa, &v[d][off], 4);
		memcpy(&pb, &v[d][off + 4], 4);
		qa = pa;
		qb = pb;
		while (--d >= 0) {
			memcpy(&da, &v[d][off], 4);
			memcpy(&db, &v[d][off + 4], 4);
			pa ^= da;
			pb ^= db;
			qa = raid_gen2_x2_32(qa) ^ da;
			qb = raid_gen2_x2_32(qb) ^ db;
		}
		memcpy(&p[off], &pa, 4);
		memcpy(&p[off + 4], &pb, 4);
		memcpy(&q[off], &qa, 4);
		memcpy(&q[off + 4], &qb, 4);
	}
}
/*
 * Doubles each byte of a word in GF(2^8) with polynomial 0x11d
 * (same operation as x2_64 in gf.h).
 */
static uint64_t raid_gen2_x2_64(uint64_t w)
{
	return ((w << 1) & 0xfefefefefefefefeULL)
		^ (((w & 0x8080808080808080ULL) >> 7) * 0x1dULL);
}

/*
 * GEN2 (RAID6 with powers of 2) 64bit C implementation.
 *
 * Computes P (plain xor) into vv[nd] and Q (Horner evaluation with
 * coefficient 2) into vv[nd + 1]; 'size' must be a multiple of 16.
 */
void raid_gen2_int64(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	size_t off;

	for (off = 0; off < size; off += 16) {
		uint64_t pa, pb, qa, qb, da, db;
		int d = nd - 1;

		memcpy(&pa, &v[d][off], 8);
		memcpy(&pb, &v[d][off + 8], 8);
		qa = pa;
		qb = pb;
		while (--d >= 0) {
			memcpy(&da, &v[d][off], 8);
			memcpy(&db, &v[d][off + 8], 8);
			pa ^= da;
			pb ^= db;
			qa = raid_gen2_x2_64(qa) ^ da;
			qb = raid_gen2_x2_64(qb) ^ db;
		}
		memcpy(&p[off], &pa, 8);
		memcpy(&p[off + 8], &pb, 8);
		memcpy(&q[off], &qa, 8);
		memcpy(&q[off + 8], &qb, 8);
	}
}
/*
 * GEN3 (triple parity with Cauchy matrix) 8bit C implementation.
 *
 * Reference version using the generic multiplication table: written
 * for clarity, not for speed (a precomputed per-coefficient table
 * would avoid the repeated cache misses).
 */
void raid_gen3_int8(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *par[3];
	size_t i;
	int j, d;

	for (j = 0; j < 3; ++j)
		par[j] = v[nd + j];

	for (i = 0; i < size; ++i) {
		/* disk 0 has all generator coefficients at 1 */
		uint8_t b = v[0][i];
		uint8_t acc[3];

		acc[0] = acc[1] = acc[2] = b;
		for (d = 1; d < nd; ++d) {
			b = v[d][i];
			acc[0] ^= b;
			acc[1] ^= gfmul[b][gfgen[1][d]];
			acc[2] ^= gfmul[b][gfgen[2][d]];
		}
		for (j = 0; j < 3; ++j)
			par[j][i] = acc[j];
	}
}
/*
 * GEN4 (quad parity with Cauchy matrix) 8bit C implementation.
 *
 * Reference version using the generic multiplication table: written
 * for clarity, not for speed.
 */
void raid_gen4_int8(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *par[4];
	size_t i;
	int j, d;

	for (j = 0; j < 4; ++j)
		par[j] = v[nd + j];

	for (i = 0; i < size; ++i) {
		/* disk 0 has all generator coefficients at 1 */
		uint8_t b = v[0][i];
		uint8_t acc[4];

		acc[0] = acc[1] = acc[2] = acc[3] = b;
		for (d = 1; d < nd; ++d) {
			b = v[d][i];
			acc[0] ^= b;
			acc[1] ^= gfmul[b][gfgen[1][d]];
			acc[2] ^= gfmul[b][gfgen[2][d]];
			acc[3] ^= gfmul[b][gfgen[3][d]];
		}
		for (j = 0; j < 4; ++j)
			par[j][i] = acc[j];
	}
}
/*
 * GEN5 (penta parity with Cauchy matrix) 8bit C implementation.
 *
 * Reference version using the generic multiplication table: written
 * for clarity, not for speed.
 */
void raid_gen5_int8(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *par[5];
	size_t i;
	int j, d;

	for (j = 0; j < 5; ++j)
		par[j] = v[nd + j];

	for (i = 0; i < size; ++i) {
		/* disk 0 has all generator coefficients at 1 */
		uint8_t b = v[0][i];
		uint8_t acc[5];

		acc[0] = acc[1] = acc[2] = acc[3] = acc[4] = b;
		for (d = 1; d < nd; ++d) {
			b = v[d][i];
			acc[0] ^= b;
			acc[1] ^= gfmul[b][gfgen[1][d]];
			acc[2] ^= gfmul[b][gfgen[2][d]];
			acc[3] ^= gfmul[b][gfgen[3][d]];
			acc[4] ^= gfmul[b][gfgen[4][d]];
		}
		for (j = 0; j < 5; ++j)
			par[j][i] = acc[j];
	}
}
/*
 * GEN6 (hexa parity with Cauchy matrix) 8bit C implementation.
 *
 * Reference version using the generic multiplication table: written
 * for clarity, not for speed.
 */
void raid_gen6_int8(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *par[6];
	size_t i;
	int j, d;

	for (j = 0; j < 6; ++j)
		par[j] = v[nd + j];

	for (i = 0; i < size; ++i) {
		/* disk 0 has all generator coefficients at 1 */
		uint8_t b = v[0][i];
		uint8_t acc[6];

		acc[0] = acc[1] = acc[2] = acc[3] = acc[4] = acc[5] = b;
		for (d = 1; d < nd; ++d) {
			b = v[d][i];
			acc[0] ^= b;
			acc[1] ^= gfmul[b][gfgen[1][d]];
			acc[2] ^= gfmul[b][gfgen[2][d]];
			acc[3] ^= gfmul[b][gfgen[3][d]];
			acc[4] ^= gfmul[b][gfgen[4][d]];
			acc[5] ^= gfmul[b][gfgen[5][d]];
		}
		for (j = 0; j < 6; ++j)
			par[j][i] = acc[j];
	}
}
/*
 * Recover failure of one data block at index id[0] using parity at
 * index ip[0], for any RAID level.
 *
 * From the equation:
 *
 * Pd = A[ip[0],id[0]] * Dx
 *
 * it follows:
 *
 * Dx = A[ip[0],id[0]]^-1 * Pd
 */
void raid_rec1_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	const uint8_t *T;
	uint8_t *p;
	uint8_t *pa;
	size_t i;

	(void)nr; /* always 1 in this function */

	/* plain parity: use the faster xor-based recovery */
	if (ip[0] == 0) {
		raid_rec1of1(id, nd, size, vv);
		return;
	}

	/* multiplication table of the inverted coefficient */
	T = table(inv(A(ip[0], id[0])));

	/* parity computed without the broken disk */
	raid_delta_gen(1, id, ip, nd, size, vv);

	p = v[nd + ip[0]];
	pa = v[id[0]];
	for (i = 0; i < size; ++i) {
		/* delta, then reconstruct in place */
		pa[i] = T[p[i] ^ pa[i]];
	}
}
/*
 * Recover failure of two data blocks at indexes id[0],id[1] using
 * parity at indexes ip[0],ip[1], for any RAID level.
 *
 * From the equations:
 *
 * Pd = A[ip[0],id[0]] * Dx + A[ip[0],id[1]] * Dy
 * Qd = A[ip[1],id[0]] * Dx + A[ip[1],id[1]] * Dy
 *
 * we solve by inverting the coefficients matrix.
 */
void raid_rec2_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	enum { N = 2 };
	uint8_t **v = (uint8_t **)vv;
	const uint8_t *T[N][N];
	uint8_t G[N * N];
	uint8_t V[N * N];
	uint8_t *p[N];
	uint8_t *pa[N];
	size_t i;
	int j, k;

	(void)nr; /* always 2 in this function */

	/* P + Q failure on RAID6 has a faster dedicated routine */
	if (ip[0] == 0 && ip[1] == 1) {
		raid_rec2of2_int8(id, ip, nd, size, vv);
		return;
	}

	/* coefficients matrix ... */
	for (j = 0; j < N; ++j)
		for (k = 0; k < N; ++k)
			G[j * N + k] = A(ip[j], id[k]);

	/* ... inverted to solve the linear system */
	raid_invert(G, V, N);

	/* multiplication tables of the inverse */
	for (j = 0; j < N; ++j)
		for (k = 0; k < N; ++k)
			T[j][k] = table(V[j * N + k]);

	/* parity computed without the broken disks */
	raid_delta_gen(2, id, ip, nd, size, vv);

	for (j = 0; j < N; ++j) {
		p[j] = v[nd + ip[j]];
		pa[j] = v[id[j]];
	}

	for (i = 0; i < size; ++i) {
		/* deltas first: pa[] is overwritten just below */
		uint8_t Pd = p[0][i] ^ pa[0][i];
		uint8_t Qd = p[1][i] ^ pa[1][i];

		pa[0][i] = T[0][0][Pd] ^ T[0][1][Qd];
		pa[1][i] = T[1][0][Pd] ^ T[1][1][Qd];
	}
}
/*
 * Recover failure of N data blocks at indexes id[N] using parity at indexes
 * ip[N] for any RAID level.
 *
 * Starting from the N equations, with 0<=i<N :
 *
 * PD[i] = sum(A[ip[i],id[j]] * D[i]) 0<=j<N
 *
 * we solve inverting the coefficients matrix.
 *
 * Note that referring to the previous equations you have:
 * PD[0] = Pd, PD[1] = Qd, PD[2] = Rd, ...
 * D[0] = Dx, D[1] = Dy, D[2] = Dz, ...
 *
 * The recovered data is written in place over the broken blocks
 * id[0..N-1]; id[] and ip[] must be sorted (see raid_sort/raid_insert).
 */
void raid_recX_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p[RAID_PARITY_MAX]; /* parity blocks in use */
	uint8_t *pa[RAID_PARITY_MAX]; /* broken blocks, then recovered data */
	const uint8_t *T[RAID_PARITY_MAX][RAID_PARITY_MAX]; /* mul tables of the inverse */
	uint8_t G[RAID_PARITY_MAX * RAID_PARITY_MAX];
	uint8_t V[RAID_PARITY_MAX * RAID_PARITY_MAX];
	size_t i;
	int j, k;
	/* setup the coefficients matrix */
	for (j = 0; j < nr; ++j)
		for (k = 0; k < nr; ++k)
			G[j * nr + k] = A(ip[j], id[k]);
	/* invert it to solve the system of linear equations */
	raid_invert(G, V, nr);
	/* get multiplication tables */
	for (j = 0; j < nr; ++j)
		for (k = 0; k < nr; ++k)
			T[j][k] = table(V[j * nr + k]);
	/* compute delta parity */
	raid_delta_gen(nr, id, ip, nd, size, vv);
	for (j = 0; j < nr; ++j) {
		p[j] = v[nd + ip[j]];
		pa[j] = v[id[j]];
	}
	for (i = 0; i < size; ++i) {
		uint8_t PD[RAID_PARITY_MAX];
		/* delta */
		for (j = 0; j < nr; ++j)
			PD[j] = p[j][i] ^ pa[j][i];
		/* reconstruct */
		for (j = 0; j < nr; ++j) {
			uint8_t b = 0;
			for (k = 0; k < nr; ++k)
				b ^= T[j][k][PD[k]];
			pa[j][i] = b;
		}
	}
}

274
raid/internal.h Normal file
View File

@ -0,0 +1,274 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_INTERNAL_H
#define __RAID_INTERNAL_H
/*
* Supported instruction sets.
*
* It may happen that the assembler is too old to support
* all instructions, even if the architecture supports them.
* These defines allow to exclude from the build the not supported ones.
*
* If in your project you use a predefined assembler, you can define them
* using fixed values, instead of using the HAVE_* defines.
*/
#if HAVE_CONFIG_H
/* Includes the project configuration for HAVE_* defines */
#include "config.h"
/* If the compiler supports assembly */
#if HAVE_ASSEMBLY
/* Autodetect from the compiler */
#if defined(__i386__)
#define CONFIG_X86 1
#define CONFIG_X86_32 1
#endif
#if defined(__x86_64__)
#define CONFIG_X86 1
#define CONFIG_X86_64 1
#endif
#endif
/* Enables SSE2, SSSE3, AVX2 only if the assembler supports it */
#if HAVE_SSE2
#define CONFIG_SSE2 1
#endif
#if HAVE_SSSE3
#define CONFIG_SSSE3 1
#endif
#if HAVE_AVX2
#define CONFIG_AVX2 1
#endif
#else /* if HAVE_CONFIG_H is not defined */
/* Assume that assembly is always supported */
#if defined(__i386__)
#define CONFIG_X86 1
#define CONFIG_X86_32 1
#endif
#if defined(__x86_64__)
#define CONFIG_X86 1
#define CONFIG_X86_64 1
#endif
/* Assumes that the assembler supports everything */
#ifdef CONFIG_X86
#define CONFIG_SSE2 1
#define CONFIG_SSSE3 1
#define CONFIG_AVX2 1
#endif
#endif
/*
* Includes anything required for compatibility.
*/
#include <assert.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
/*
 * Inverse assert: aborts when the condition IS true
 * (kernel-style name, mapped to assert() in userspace).
 */
#define BUG_ON(a) assert(!(a))
/*
 * Forced inline, unless the platform headers already define it.
 */
#ifndef __always_inline
#define __always_inline inline __attribute__((always_inline))
#endif
/*
 * Forced alignment attribute for data definitions.
 */
#ifndef __aligned
#define __aligned(a) __attribute__((aligned(a)))
#endif
/*
 * Align a pointer at the specified size.
 * Rounds UP to the next multiple of 'size' (must be a power of two);
 * the result may point up to size-1 bytes past 'ptr'.
 */
static __always_inline void *__align_ptr(void *ptr, uintptr_t size)
{
	uintptr_t offset = (uintptr_t)ptr;
	offset = (offset + size - 1U) & ~(size - 1U);
	return (void *)offset;
}
/*
* Includes the main interface headers.
*/
#include "raid.h"
#include "helper.h"
/*
* Internal functions.
*
* These are intended to provide access for testing.
*/
int raid_selftest(void);
void raid_gen_ref(int nd, int np, size_t size, void **vv);
void raid_invert(uint8_t *M, uint8_t *V, int n);
void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v);
void raid_rec1of1(int *id, int nd, size_t size, void **v);
void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv);
void raid_gen1_int32(int nd, size_t size, void **vv);
void raid_gen1_int64(int nd, size_t size, void **vv);
void raid_gen1_sse2(int nd, size_t size, void **vv);
void raid_gen1_avx2(int nd, size_t size, void **vv);
void raid_gen2_int32(int nd, size_t size, void **vv);
void raid_gen2_int64(int nd, size_t size, void **vv);
void raid_gen2_sse2(int nd, size_t size, void **vv);
void raid_gen2_avx2(int nd, size_t size, void **vv);
void raid_gen2_sse2ext(int nd, size_t size, void **vv);
void raid_genz_int32(int nd, size_t size, void **vv);
void raid_genz_int64(int nd, size_t size, void **vv);
void raid_genz_sse2(int nd, size_t size, void **vv);
void raid_genz_sse2ext(int nd, size_t size, void **vv);
void raid_genz_avx2ext(int nd, size_t size, void **vv);
void raid_gen3_int8(int nd, size_t size, void **vv);
void raid_gen3_ssse3(int nd, size_t size, void **vv);
void raid_gen3_ssse3ext(int nd, size_t size, void **vv);
void raid_gen3_avx2ext(int nd, size_t size, void **vv);
void raid_gen4_int8(int nd, size_t size, void **vv);
void raid_gen4_ssse3(int nd, size_t size, void **vv);
void raid_gen4_ssse3ext(int nd, size_t size, void **vv);
void raid_gen4_avx2ext(int nd, size_t size, void **vv);
void raid_gen5_int8(int nd, size_t size, void **vv);
void raid_gen5_ssse3(int nd, size_t size, void **vv);
void raid_gen5_ssse3ext(int nd, size_t size, void **vv);
void raid_gen5_avx2ext(int nd, size_t size, void **vv);
void raid_gen6_int8(int nd, size_t size, void **vv);
void raid_gen6_ssse3(int nd, size_t size, void **vv);
void raid_gen6_ssse3ext(int nd, size_t size, void **vv);
void raid_gen6_avx2ext(int nd, size_t size, void **vv);
void raid_rec1_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_rec2_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_recX_int8(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_rec1_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_rec2_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_recX_ssse3(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_rec1_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_rec2_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_recX_avx2(int nr, int *id, int *ip, int nd, size_t size, void **vv);
/*
* Internal naming.
*
* These are intented to provide access for testing.
*/
const char *raid_gen1_tag(void);
const char *raid_gen2_tag(void);
const char *raid_genz_tag(void);
const char *raid_gen3_tag(void);
const char *raid_gen4_tag(void);
const char *raid_gen5_tag(void);
const char *raid_gen6_tag(void);
const char *raid_rec1_tag(void);
const char *raid_rec2_tag(void);
const char *raid_recX_tag(void);
/*
* Internal forwarders.
*/
extern void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
extern void (*raid_genz_ptr)(int nd, size_t size, void **vv);
extern void (*raid_gen_ptr[RAID_PARITY_MAX])(
int nd, size_t size, void **vv);
extern void (*raid_rec_ptr[RAID_PARITY_MAX])(
int nr, int *id, int *ip, int nd, size_t size, void **vv);
/*
* Tables.
*/
extern const uint8_t raid_gfmul[256][256] __aligned(256);
extern const uint8_t raid_gfexp[256] __aligned(256);
extern const uint8_t raid_gfinv[256] __aligned(256);
extern const uint8_t raid_gfvandermonde[3][256] __aligned(256);
extern const uint8_t raid_gfcauchy[6][256] __aligned(256);
extern const uint8_t raid_gfcauchypshufb[251][4][2][16] __aligned(256);
extern const uint8_t raid_gfmulpshufb[256][2][16] __aligned(256);
extern const uint8_t (*raid_gfgen)[256];
#define gfmul raid_gfmul
#define gfexp raid_gfexp
#define gfinv raid_gfinv
#define gfvandermonde raid_gfvandermonde
#define gfcauchy raid_gfcauchy
#define gfgenpshufb raid_gfcauchypshufb
#define gfmulpshufb raid_gfmulpshufb
#define gfgen raid_gfgen
/*
* Assembler blocks.
*/
#ifdef CONFIG_X86
#ifdef CONFIG_SSE2
/* Prologue for the SSE asm blocks: nothing to do on entry. */
static __always_inline void raid_sse_begin(void)
{
}
/*
 * Epilogue for the SSE asm blocks: publishes the non-temporal stores
 * and tells the compiler the vector registers were clobbered.
 */
static __always_inline void raid_sse_end(void)
{
	/* SSE and AVX code uses non-temporal writes, like MOVNTDQ, */
	/* that use a weak memory model. To ensure that other processors */
	/* see correctly the data written, we use a store-store memory */
	/* barrier at the end of the asm code */
	asm volatile ("sfence" : : : "memory");
	/* clobbers registers used in the asm code */
	/* this is required because in the Windows ABI, */
	/* registers xmm6-xmm15 should be kept by the callee. */
	/* this clobber list forces the compiler to save any */
	/* register that needs to be saved */
	/* we check for __SSE2__ because we require that the */
	/* compiler supports SSE2 registers in the clobber list */
#ifdef __SSE2__
	asm volatile ("" : : : "%xmm0", "%xmm1", "%xmm2", "%xmm3");
	asm volatile ("" : : : "%xmm4", "%xmm5", "%xmm6", "%xmm7");
#ifdef CONFIG_X86_64
	asm volatile ("" : : : "%xmm8", "%xmm9", "%xmm10", "%xmm11");
	asm volatile ("" : : : "%xmm12", "%xmm13", "%xmm14", "%xmm15");
#endif
#endif
}
#endif
#ifdef CONFIG_AVX2
/* Prologue for the AVX asm blocks: same as the SSE one. */
static __always_inline void raid_avx_begin(void)
{
	raid_sse_begin();
}
/*
 * Epilogue for the AVX asm blocks: SSE epilogue plus VZEROUPPER.
 */
static __always_inline void raid_avx_end(void)
{
	raid_sse_end();
	/* reset the upper part of the ymm registers */
	/* to avoid the 70 clocks penalty on the next */
	/* xmm register use */
	asm volatile ("vzeroupper" : : : "memory");
}
#endif
#endif /* CONFIG_X86 */
#endif

119
raid/intz.c Normal file
View File

@ -0,0 +1,119 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "gf.h"
/*
 * GENz (triple parity with powers of 2^-1) 32bit C implementation.
 *
 * Computes P (xor), Q (Horner with coefficient 2) and R (Horner with
 * coefficient 2^-1) into vv[nd..nd+2]; 'size' must be a multiple of 8.
 */
void raid_genz_int32(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	uint8_t *r = v[nd + 2];
	size_t i;
	int d;

	for (i = 0; i < size; i += 8) {
		uint32_t pa, pb, qa, qb, ra, rb, da, db;

		/* start from the last data block */
		qa = ra = pa = v_32(v[nd - 1][i]);
		qb = rb = pb = v_32(v[nd - 1][i + 4]);
		for (d = nd - 2; d >= 0; --d) {
			da = v_32(v[d][i]);
			db = v_32(v[d][i + 4]);
			pa ^= da;
			pb ^= db;
			qa = x2_32(qa) ^ da;
			qb = x2_32(qb) ^ db;
			ra = d2_32(ra) ^ da;
			rb = d2_32(rb) ^ db;
		}
		v_32(p[i]) = pa;
		v_32(p[i + 4]) = pb;
		v_32(q[i]) = qa;
		v_32(q[i + 4]) = qb;
		v_32(r[i]) = ra;
		v_32(r[i + 4]) = rb;
	}
}
/*
 * GENz (triple parity with powers of 2^-1) 64bit C implementation.
 *
 * Computes P (xor), Q (Horner with coefficient 2) and R (Horner with
 * coefficient 2^-1) into vv[nd..nd+2]; 'size' must be a multiple of 16.
 */
void raid_genz_int64(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	uint8_t *p = v[nd];
	uint8_t *q = v[nd + 1];
	uint8_t *r = v[nd + 2];
	size_t i;
	int d;

	for (i = 0; i < size; i += 16) {
		uint64_t pa, pb, qa, qb, ra, rb, da, db;

		/* start from the last data block */
		qa = ra = pa = v_64(v[nd - 1][i]);
		qb = rb = pb = v_64(v[nd - 1][i + 8]);
		for (d = nd - 2; d >= 0; --d) {
			da = v_64(v[d][i]);
			db = v_64(v[d][i + 8]);
			pa ^= da;
			pb ^= db;
			qa = x2_64(qa) ^ da;
			qb = x2_64(qb) ^ db;
			ra = d2_64(ra) ^ da;
			rb = d2_64(rb) ^ db;
		}
		v_64(p[i]) = pa;
		v_64(p[i + 8]) = pb;
		v_64(q[i]) = qa;
		v_64(q[i + 8]) = qb;
		v_64(r[i]) = ra;
		v_64(r[i + 8]) = rb;
	}
}

154
raid/memory.c Normal file
View File

@ -0,0 +1,154 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "memory.h"
/**
 * Allocates an aligned memory block.
 *
 * @size Bytes to allocate.
 * @align_size Required alignment (nonzero).
 * @freeptr Receives the raw malloc() pointer to pass to free().
 *   On failure it is set to 0, so callers may free() it unconditionally
 *   (previously it was left indeterminate).
 * Returns the aligned pointer, or 0 on allocation failure.
 */
void *raid_malloc_align(size_t size, size_t align_size, void **freeptr)
{
	unsigned char *ptr;
	uintptr_t offset;

	/* over-allocate to make room for the alignment adjustment */
	ptr = malloc(size + align_size);
	if (!ptr) {
		/* LCOV_EXCL_START */
		*freeptr = 0;
		return 0;
		/* LCOV_EXCL_STOP */
	}

	*freeptr = ptr;

	/* advance to the next align_size boundary */
	offset = ((uintptr_t)ptr) % align_size;
	if (offset != 0)
		ptr += align_size - offset;

	return ptr;
}
/**
 * Allocates 'size' bytes aligned to RAID_MALLOC_ALIGN.
 *
 * @size Bytes to allocate.
 * @freeptr Receives the raw pointer to pass to free().
 * Returns the aligned pointer, or 0 on allocation failure.
 */
void *raid_malloc(size_t size, void **freeptr)
{
	return raid_malloc_align(size, RAID_MALLOC_ALIGN, freeptr);
}
/**
 * Allocates a vector of 'n' buffers of 'size' bytes each.
 *
 * All buffers come from one backing allocation, returned in *freeptr
 * and to be released with a single free(); each buffer is aligned to
 * align_size and consecutive buffers are spaced by an extra
 * displacement_size bytes.
 *
 * @nd Number of data blocks; their pointers are stored in reverse
 *   order because they are usually accessed from the last one.
 *   Must satisfy 0 <= nd <= n (now enforced: a larger nd would make
 *   the reversal loop below index out of bounds).
 * @n Total number of buffers (> 0).
 * Returns the vector of pointers (free() it separately), or 0 on
 * allocation failure.
 */
void **raid_malloc_vector_align(int nd, int n, size_t size, size_t align_size, size_t displacement_size, void **freeptr)
{
	void **v;
	unsigned char *va;
	int i;

	BUG_ON(n <= 0 || nd < 0 || nd > n);

	v = malloc(n * sizeof(void *));
	if (!v) {
		/* LCOV_EXCL_START */
		return 0;
		/* LCOV_EXCL_STOP */
	}

	va = raid_malloc_align(n * (size + displacement_size), align_size, freeptr);
	if (!va) {
		/* LCOV_EXCL_START */
		free(v);
		return 0;
		/* LCOV_EXCL_STOP */
	}

	for (i = 0; i < n; ++i) {
		v[i] = va;
		va += size + displacement_size;
	}

	/* reverse order of the data blocks */
	/* because they are usually accessed from the last one */
	for (i = 0; i < nd / 2; ++i) {
		void *ptr = v[i];

		v[i] = v[nd - 1 - i];
		v[nd - 1 - i] = ptr;
	}

	return v;
}
/**
 * Aligned vector allocation with the default alignment and displacement.
 */
void **raid_malloc_vector(int nd, int n, size_t size, void **freeptr)
{
	void **v;

	v = raid_malloc_vector_align(nd, n, size, RAID_MALLOC_ALIGN, RAID_MALLOC_DISPLACEMENT, freeptr);

	return v;
}
/**
 * Fills the @n blocks of @size bytes pointed by @vv with pseudo-random
 * bytes derived from @seed.
 *
 * Uses the basic C99/C11 linear congruential generator, keeping the
 * high bits of the state which are the most random ones.
 */
void raid_mrand_vector(unsigned seed, int n, size_t size, void **vv)
{
	unsigned char **block = (unsigned char **)vv;
	int b;
	size_t off;

	for (b = 0; b < n; ++b) {
		for (off = 0; off < size; ++off) {
			seed = seed * 1103515245U + 12345U;
			block[b][off] = seed >> 16;
		}
	}
}
/**
 * Walks the @n blocks of @size bytes with a sequence of byte patterns,
 * reading every byte back to detect RAM problems.
 *
 * Returns 0 on success, or -1 if a mismatch is found.
 */
int raid_mtest_vector(int n, size_t size, void **vv)
{
	unsigned char **block = (unsigned char **)vv;
	unsigned pattern;
	unsigned char expected;
	unsigned char value;
	int b;
	size_t off;

	/* start with everything zeroed */
	value = 0;
	for (b = 0; b < n; ++b)
		for (off = 0; off < size; ++off)
			block[b][off] = value;

	/* test with all the byte patterns */
	for (pattern = 1; pattern < 256; ++pattern) {
		expected = value;
		value = pattern;

		/* forward pass: verify the old value, write the new one */
		for (b = 0; b < n; ++b) {
			for (off = 0; off < size; ++off) {
				if (block[b][off] != expected) {
					/* LCOV_EXCL_START */
					return -1;
					/* LCOV_EXCL_STOP */
				}
				block[b][off] = value;
			}
		}

		expected = value;
		value = ~expected;

		/* backward pass: verify, then write the complement */
		for (b = 0; b < n; ++b) {
			for (off = size; off > 0; --off) {
				if (block[b][off - 1] != expected) {
					/* LCOV_EXCL_START */
					return -1;
					/* LCOV_EXCL_STOP */
				}
				block[b][off - 1] = value;
			}
		}
	}

	return 0;
}

96
raid/memory.h Normal file
View File

@ -0,0 +1,96 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_MEMORY_H
#define __RAID_MEMORY_H
/**
* Memory alignment provided by raid_malloc().
*
* It should guarantee good cache performance everywhere.
*/
#define RAID_MALLOC_ALIGN 256
/**
* Memory displacement to avoid cache address sharing on contiguous blocks,
* used by raid_malloc_vector().
*
* When allocating a sequence of blocks with a size of power of 2,
* there is the risk that the addresses of each block are mapped into the
* same cache line and prefetching predictor, resulting in a lot of cache
* sharing if you access all the blocks in parallel, from the start to the
* end.
*
* To avoid this effect, it's better if all the blocks are allocated
* with a fixed displacement trying to reduce the cache addresses sharing.
*
* The selected displacement was chosen empirically with some speed tests
* with 8/12/16/20/24 data buffers of 256 KB.
*
* These are the results in MB/s with no displacement:
*
* sse2
* gen1 15368 [MB/s]
* gen2 6814 [MB/s]
* genz 3033 [MB/s]
*
 * These are the results with displacement resulting in improvements
* in the order of 20% or more:
*
* sse2
* gen1 21936 [MB/s]
* gen2 11902 [MB/s]
* genz 5838 [MB/s]
*
*/
#define RAID_MALLOC_DISPLACEMENT (7*256)
/**
* Aligned malloc.
* Use an alignment suitable for the raid functions.
*/
void *raid_malloc(size_t size, void **freeptr);
/**
* Arbitrary aligned malloc.
*/
void *raid_malloc_align(size_t size, size_t align_size, void **freeptr);
/**
* Aligned vector allocation.
* Use an alignment suitable for the raid functions.
* Returns a vector of @n pointers, each one pointing to a block of
* the specified @size.
* The first @nd elements are reversed in order.
*/
void **raid_malloc_vector(int nd, int n, size_t size, void **freeptr);
/**
* Arbitrary aligned vector allocation.
*/
void **raid_malloc_vector_align(int nd, int n, size_t size, size_t align_size, size_t displacement_size, void **freeptr);
/**
* Fills the memory vector with pseudo-random data based on the specified seed.
*/
void raid_mrand_vector(unsigned seed, int n, size_t size, void **vv);
/**
* Tests the memory vector for RAM problems.
 * Returns 0 on success, or -1 if a problem is found.
*/
int raid_mtest_vector(int n, size_t size, void **vv);
#endif

473
raid/module.c Normal file
View File

@ -0,0 +1,473 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "memory.h"
#include "cpu.h"
/*
 * Initializes and selects the best algorithm.
 *
 * Fills the generation and recovering forwarders with the fastest
 * implementation available for the runtime CPU, then selects the
 * default RAID_MODE_CAUCHY mode.
 */
void raid_init(void)
{
	/* portable int8 fallbacks for triple parity and beyond */
	raid_gen3_ptr = raid_gen3_int8;
	raid_gen_ptr[3] = raid_gen4_int8;
	raid_gen_ptr[4] = raid_gen5_int8;
	raid_gen_ptr[5] = raid_gen6_int8;
	/* pick the generic implementation matching the pointer size */
	if (sizeof(void *) == 4) {
		raid_gen_ptr[0] = raid_gen1_int32;
		raid_gen_ptr[1] = raid_gen2_int32;
		raid_genz_ptr = raid_genz_int32;
	} else {
		raid_gen_ptr[0] = raid_gen1_int64;
		raid_gen_ptr[1] = raid_gen2_int64;
		raid_genz_ptr = raid_genz_int64;
	}
	/* portable recovering; three or more failures share the generic path */
	raid_rec_ptr[0] = raid_rec1_int8;
	raid_rec_ptr[1] = raid_rec2_int8;
	raid_rec_ptr[2] = raid_recX_int8;
	raid_rec_ptr[3] = raid_recX_int8;
	raid_rec_ptr[4] = raid_recX_int8;
	raid_rec_ptr[5] = raid_recX_int8;
#ifdef CONFIG_X86
#ifdef CONFIG_SSE2
	if (raid_cpu_has_sse2()) {
		raid_gen_ptr[0] = raid_gen1_sse2;
#ifdef CONFIG_X86_64
		/* on x86_64 prefer the variants using the extended xmm */
		/* registers, unless the CPU handles them slowly */
		if (raid_cpu_has_slowextendedreg()) {
			raid_gen_ptr[1] = raid_gen2_sse2;
		} else {
			raid_gen_ptr[1] = raid_gen2_sse2ext;
		}
		/* note that raid_cpu_has_slowextendedreg() doesn't affect parz */
		raid_genz_ptr = raid_genz_sse2ext;
#else
		raid_gen_ptr[1] = raid_gen2_sse2;
		raid_genz_ptr = raid_genz_sse2;
#endif
	}
#endif
#ifdef CONFIG_SSSE3
	/* SSSE3 enables the table based generation for high parities */
	if (raid_cpu_has_ssse3()) {
#ifdef CONFIG_X86_64
		if (raid_cpu_has_slowextendedreg()) {
			raid_gen3_ptr = raid_gen3_ssse3;
			raid_gen_ptr[3] = raid_gen4_ssse3;
			raid_gen_ptr[4] = raid_gen5_ssse3;
			raid_gen_ptr[5] = raid_gen6_ssse3;
		} else {
			raid_gen3_ptr = raid_gen3_ssse3ext;
			raid_gen_ptr[3] = raid_gen4_ssse3ext;
			raid_gen_ptr[4] = raid_gen5_ssse3ext;
			raid_gen_ptr[5] = raid_gen6_ssse3ext;
		}
#else
		raid_gen3_ptr = raid_gen3_ssse3;
		raid_gen_ptr[3] = raid_gen4_ssse3;
		raid_gen_ptr[4] = raid_gen5_ssse3;
		raid_gen_ptr[5] = raid_gen6_ssse3;
#endif
		raid_rec_ptr[0] = raid_rec1_ssse3;
		raid_rec_ptr[1] = raid_rec2_ssse3;
		raid_rec_ptr[2] = raid_recX_ssse3;
		raid_rec_ptr[3] = raid_recX_ssse3;
		raid_rec_ptr[4] = raid_recX_ssse3;
		raid_rec_ptr[5] = raid_recX_ssse3;
	}
#endif
#ifdef CONFIG_AVX2
	/* AVX2 is the fastest option when available */
	if (raid_cpu_has_avx2()) {
		raid_gen_ptr[0] = raid_gen1_avx2;
		raid_gen_ptr[1] = raid_gen2_avx2;
#ifdef CONFIG_X86_64
		raid_gen3_ptr = raid_gen3_avx2ext;
		raid_genz_ptr = raid_genz_avx2ext;
		raid_gen_ptr[3] = raid_gen4_avx2ext;
		raid_gen_ptr[4] = raid_gen5_avx2ext;
		raid_gen_ptr[5] = raid_gen6_avx2ext;
#endif
		raid_rec_ptr[0] = raid_rec1_avx2;
		raid_rec_ptr[1] = raid_rec2_avx2;
		raid_rec_ptr[2] = raid_recX_avx2;
		raid_rec_ptr[3] = raid_recX_avx2;
		raid_rec_ptr[4] = raid_recX_avx2;
		raid_rec_ptr[5] = raid_recX_avx2;
	}
#endif
#endif /* CONFIG_X86 */
	/* set the default mode */
	raid_mode(RAID_MODE_CAUCHY);
}
/*
 * Reference parity computation.
 *
 * Computes @np parities of the @nd data blocks in @vv one byte at a
 * time, using the multiplication table and the generator matrix
 * directly. Slow, but simple enough to serve as reference in the
 * self test.
 */
void raid_gen_ref(int nd, int np, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	size_t off;
	int j, d;

	for (off = 0; off < size; ++off) {
		uint8_t acc[RAID_PARITY_MAX] = { 0 };

		for (d = 0; d < nd; ++d) {
			uint8_t b = v[d][off];

			for (j = 0; j < np; ++j)
				acc[j] ^= gfmul[b][gfgen[j][d]];
		}

		for (j = 0; j < np; ++j)
			v[nd + j][off] = acc[j];
	}
}
/*
* Size of the blocks to test.
*/
#define TEST_SIZE 4096
/*
* Number of data blocks to test.
*/
#define TEST_COUNT (65536 / TEST_SIZE)
/*
 * Parity generation test.
 *
 * Computes the parities of the reference data into the scratch
 * buffers of @v and compares them with the reference parities.
 * Returns 0 on success, -1 on mismatch.
 */
static int raid_test_par(int nd, int np, size_t size, void **v, void **ref)
{
	void *t[TEST_COUNT + RAID_PARITY_MAX];
	int i;

	/* data comes from the reference */
	for (i = 0; i < nd; ++i)
		t[i] = ref[i];

	/* parity goes into the scratch buffers */
	for (i = 0; i < np; ++i)
		t[nd + i] = v[nd + i];

	raid_gen(nd, np, size, t);

	/* every computed parity has to match the reference one */
	for (i = 0; i < np; ++i) {
		if (memcmp(t[nd + i], ref[nd + i], size) != 0) {
			/* LCOV_EXCL_START */
			return -1;
			/* LCOV_EXCL_STOP */
		}
	}

	return 0;
}
/*
 * Recovering test.
 *
 * Marks the blocks listed in @ir as bad, recovers them with
 * raid_rec() and checks the result against the reference.
 * Returns 0 on success, -1 on mismatch.
 */
static int raid_test_rec(int nr, int *ir, int nd, int np, size_t size, void **v, void **ref)
{
	void *t[TEST_COUNT + RAID_PARITY_MAX];
	int i, j;

	/* bad blocks get the scratch buffers, good ones the reference */
	for (i = 0, j = 0; i < nd + np; ++i) {
		if (j < nr && ir[j] == i) {
			t[i] = v[i];
			++j;
		} else {
			t[i] = ref[i];
		}
	}

	raid_rec(nr, ir, nd, np, size, t);

	/* after recovering everything has to match the reference */
	for (i = 0; i < nd + np; ++i) {
		if (t[i] != ref[i] && memcmp(t[i], ref[i], size) != 0) {
			/* LCOV_EXCL_START */
			return -1;
			/* LCOV_EXCL_STOP */
		}
	}

	return 0;
}
/*
 * Recovering test for data.
 *
 * Like raid_test_rec(), but using raid_data() which reads only the
 * parities listed in @ip. The unused parity slots are set to 0 so an
 * unexpected access would crash.
 * Returns 0 on success, -1 on mismatch.
 */
static int raid_test_data(int nr, int *id, int *ip, int nd, int np, size_t size, void **v, void **ref)
{
	void *t[TEST_COUNT + RAID_PARITY_MAX];
	int i, j;

	/* bad data blocks get the scratch buffers */
	for (i = 0, j = 0; i < nd; ++i) {
		if (j < nr && id[j] == i) {
			t[i] = v[i];
			++j;
		} else {
			t[i] = ref[i];
		}
	}

	/* only the parities used for recovering are provided */
	for (i = 0, j = 0; i < np; ++i) {
		if (j < nr && ip[j] == i) {
			t[nd + i] = ref[nd + i];
			++j;
		} else {
			t[nd + i] = 0;
		}
	}

	raid_data(nr, id, ip, nd, size, t);

	/* all the data has to match the reference */
	for (i = 0; i < nd; ++i) {
		if (t[i] != ref[i] && t[i] != 0 && memcmp(t[i], ref[i], size) != 0) {
			/* LCOV_EXCL_START */
			return -1;
			/* LCOV_EXCL_STOP */
		}
	}

	return 0;
}
/*
 * Scan test.
 *
 * Presents the corrupted blocks listed in @ir to raid_scan() and
 * checks that exactly those blocks are identified as bad.
 * Returns 0 on success, -1 on mismatch.
 */
static int raid_test_scan(int nr, int *ir, int nd, int np, size_t size, void **v, void **ref)
{
	void *t[TEST_COUNT + RAID_PARITY_MAX];
	int is[RAID_PARITY_MAX];
	int i, j, ret;

	/* bad blocks get the (corrupted) scratch buffers */
	for (i = 0, j = 0; i < nd + np; ++i) {
		if (j < nr && ir[j] == i) {
			t[i] = v[i];
			++j;
		} else {
			t[i] = ref[i];
		}
	}

	ret = raid_scan(is, nd, np, size, t);

	/* both the count and the indexes of the bad blocks have to match */
	if (ret != nr)
		return -1;

	for (i = 0; i < nr; ++i) {
		if (ir[i] != is[i]) {
			/* LCOV_EXCL_START */
			return -1;
			/* LCOV_EXCL_STOP */
		}
	}

	return 0;
}
/*
 * Basic functionality self test.
 *
 * Runs parity generation, recovering and scan tests for every parity
 * level, using the GF multiplication table itself as the data set.
 *
 * Returns 0 on success, -1 on failure.
 */
int raid_selftest(void)
{
	const int nd = TEST_COUNT;
	const size_t size = TEST_SIZE;
	/* layout of v[]: nd data blocks, RAID_PARITY_MAX scratch parities, */
	/* RAID_PARITY_MAX reference parities, and one zero block */
	const int nv = nd + RAID_PARITY_MAX * 2 + 1;
	void *v_alloc;
	void **v;
	void *ref[nd + RAID_PARITY_MAX];
	int ir[RAID_PARITY_MAX];
	int ip[RAID_PARITY_MAX];
	int i, np;
	int ret = 0;
	/* ensure to have enough space for data */
	BUG_ON(nd * size > 65536);
	v = raid_malloc_vector(nd, nv, size, &v_alloc);
	if (!v) {
		/* LCOV_EXCL_START */
		return -1;
		/* LCOV_EXCL_STOP */
	}
	/* the last block is the zero buffer needed by recovering */
	memset(v[nv - 1], 0, size);
	raid_zero(v[nv - 1]);
	/* use the multiplication table as data */
	for (i = 0; i < nd; ++i)
		ref[i] = ((uint8_t *)gfmul) + size * i;
	/* setup reference parity */
	for (i = 0; i < RAID_PARITY_MAX; ++i)
		ref[nd + i] = v[nd + RAID_PARITY_MAX + i];
	/* compute reference parity */
	raid_gen_ref(nd, RAID_PARITY_MAX, size, ref);
	/* test for each parity level */
	for (np = 1; np <= RAID_PARITY_MAX; ++np) {
		/* test parity generation */
		ret = raid_test_par(nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		/* test recovering with broken ending data disks */
		for (i = 0; i < np; ++i) {
			/* bad data */
			ir[i] = nd - np + i;
			/* good parity */
			ip[i] = i;
		}
		ret = raid_test_rec(np, ir, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		ret = raid_test_data(np, ir, ip, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		/* test recovering with broken leading data and broken leading parity */
		for (i = 0; i < np / 2; ++i) {
			/* bad data */
			ir[i] = i;
			/* good parity */
			ip[i] = (np + 1) / 2 + i;
		}
		/* bad parity */
		for (i = 0; i < (np + 1) / 2; ++i)
			ir[np / 2 + i] = nd + i;
		ret = raid_test_rec(np, ir, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		ret = raid_test_data(np / 2, ir, ip, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		/* test recovering with broken leading data and broken ending parity */
		for (i = 0; i < np / 2; ++i) {
			/* bad data */
			ir[i] = i;
			/* good parity */
			ip[i] = i;
		}
		/* bad parity */
		for (i = 0; i < (np + 1) / 2; ++i)
			ir[np / 2 + i] = nd + np - (np + 1) / 2 + i;
		ret = raid_test_rec(np, ir, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		ret = raid_test_data(np / 2, ir, ip, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
		/* scan test with broken data and parity */
		for (i = 0; i < np / 2; ++i) {
			/* bad data */
			ir[i] = i;
		}
		for (i = 0; i < (np - 1) / 2; ++i) {
			/* bad parity */
			ir[np / 2 + i] = nd + i;
		}
		for (i = 0; i < np - 1; ++i) {
			/* make blocks bad */
			/* we cannot fill them with 0, because the original */
			/* data may be already filled with 0 */
			memset(v[ir[i]], 0x55, size);
		}
		ret = raid_test_scan(np - 1, ir, nd, np, size, v, ref);
		if (ret != 0) {
			/* LCOV_EXCL_START */
			goto bail;
			/* LCOV_EXCL_STOP */
		}
	}
	/* scan test with no parity, expected to fail */
	ret = raid_test_scan(0, 0, nd, 0, size, v, ref);
	if (ret != -1) {
		/* LCOV_EXCL_START */
		/* an unexpected success here means the scan is broken; */
		/* force a failure code, otherwise we would bail out */
		/* with ret == 0 and wrongly report the self test as passed */
		ret = -1;
		goto bail;
		/* LCOV_EXCL_STOP */
	}
	ret = 0;
bail:
	free(v);
	free(v_alloc);
	return ret;
}

586
raid/raid.c Normal file
View File

@ -0,0 +1,586 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "gf.h"
/*
* This is a RAID implementation working in the Galois Field GF(2^8) with
* the primitive polynomial x^8 + x^4 + x^3 + x^2 + 1 (285 decimal), and
* supporting up to six parity levels.
*
 * For RAID5 and RAID6 it works as described in H. Peter Anvin's
* paper "The mathematics of RAID-6" [1]. Please refer to this paper for a
* complete explanation.
*
* To support triple parity, it was first evaluated and then dropped, an
* extension of the same approach, with additional parity coefficients set
* as powers of 2^-1, with equations:
*
* P = sum(Di)
* Q = sum(2^i * Di)
* R = sum(2^-i * Di) with 0<=i<N
*
* This approach works well for triple parity and it's very efficient,
* because we can implement very fast parallel multiplications and
* divisions by 2 in GF(2^8).
*
* It's also similar at the approach used by ZFS RAIDZ3, with the
* difference that ZFS uses powers of 4 instead of 2^-1.
*
* Unfortunately it doesn't work beyond triple parity, because whatever
* value we choose to generate the power coefficients to compute other
* parities, the resulting equations are not solvable for some
* combinations of missing disks.
*
* This is expected, because the Vandermonde matrix used to compute the
* parity has no guarantee to have all submatrices not singular
* [2, Chap 11, Problem 7] and this is a requirement to have
* a MDS (Maximum Distance Separable) code [2, Chap 11, Theorem 8].
*
* To overcome this limitation, we use a Cauchy matrix [3][4] to compute
* the parity. A Cauchy matrix has the property to have all the square
* submatrices not singular, resulting in always solvable equations,
* for any combination of missing disks.
*
* The problem of this approach is that it requires the use of
* generic multiplications, and not only by 2 or 2^-1, potentially
* affecting badly the performance.
*
 * Fortunately there is a method to implement parallel multiplications
 * using SSSE3 or AVX2 instructions [1][5], which is competitive with
 * the computation of triple parity using power coefficients.
*
* Another important property of the Cauchy matrix is that we can setup
 * the first two rows with coefficients equal to the RAID5 and RAID6
 * approach described, resulting in a compatible extension, and requiring SSSE3
* or AVX2 instructions only if triple parity or beyond is used.
*
 * The matrix is also adjusted, multiplying each row by a constant factor
* to make the first column of all 1, to optimize the computation for
* the first disk.
*
* This results in the matrix A[row,col] defined as:
*
* 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01...
* 01 02 04 08 10 20 40 80 1d 3a 74 e8 cd 87 13 26 4c 98 2d 5a b4 75...
* 01 f5 d2 c4 9a 71 f1 7f fc 87 c1 c6 19 2f 40 55 3d ba 53 04 9c 61...
* 01 bb a6 d7 c7 07 ce 82 4a 2f a5 9b b6 60 f1 ad e7 f4 06 d2 df 2e...
* 01 97 7f 9c 7c 18 bd a2 58 1a da 74 70 a3 e5 47 29 07 f5 80 23 e9...
* 01 2b 3f cf 73 2c d6 ed cb 74 15 78 8a c1 17 c9 89 68 21 ab 76 3b...
*
* This matrix supports 6 level of parity, one for each row, for up to 251
* data disks, one for each column, with all the 377,342,351,231 square
* submatrices not singular, verified also with brute-force.
*
* This matrix can be extended to support any number of parities, just
* adding additional rows, and removing one column for each new row.
* (see mktables.c for more details in how the matrix is generated)
*
* In details, parity is computed as:
*
* P = sum(Di)
* Q = sum(2^i * Di)
* R = sum(A[2,i] * Di)
* S = sum(A[3,i] * Di)
* T = sum(A[4,i] * Di)
* U = sum(A[5,i] * Di) with 0<=i<N
*
* To recover from a failure of six disks at indexes x,y,z,h,v,w,
* with 0<=x<y<z<h<v<w<N, we compute the parity of the available N-6
* disks as:
*
* Pa = sum(Di)
* Qa = sum(2^i * Di)
* Ra = sum(A[2,i] * Di)
* Sa = sum(A[3,i] * Di)
* Ta = sum(A[4,i] * Di)
* Ua = sum(A[5,i] * Di) with 0<=i<N,i!=x,i!=y,i!=z,i!=h,i!=v,i!=w.
*
* And if we define:
*
* Pd = Pa + P
* Qd = Qa + Q
* Rd = Ra + R
* Sd = Sa + S
* Td = Ta + T
* Ud = Ua + U
*
* we can sum these two sets of equations, obtaining:
*
* Pd = Dx + Dy + Dz + Dh + Dv + Dw
* Qd = 2^x * Dx + 2^y * Dy + 2^z * Dz + 2^h * Dh + 2^v * Dv + 2^w * Dw
* Rd = A[2,x] * Dx + A[2,y] * Dy + A[2,z] * Dz + A[2,h] * Dh + A[2,v] * Dv + A[2,w] * Dw
* Sd = A[3,x] * Dx + A[3,y] * Dy + A[3,z] * Dz + A[3,h] * Dh + A[3,v] * Dv + A[3,w] * Dw
* Td = A[4,x] * Dx + A[4,y] * Dy + A[4,z] * Dz + A[4,h] * Dh + A[4,v] * Dv + A[4,w] * Dw
* Ud = A[5,x] * Dx + A[5,y] * Dy + A[5,z] * Dz + A[5,h] * Dh + A[5,v] * Dv + A[5,w] * Dw
*
 * This is a linear system that is always solvable, because the
 * coefficient matrix is always non-singular due to the properties of
 * the matrix A[].
*
* Resulting speed in x64, with 8 data disks, using a stripe of 256 KiB,
* for a Core i5-4670K Haswell Quad-Core 3.4GHz is:
*
* int8 int32 int64 sse2 ssse3 avx2
* gen1 13339 25438 45438 50588
* gen2 4115 6514 21840 32201
* gen3 814 10154 18613
* gen4 620 7569 14229
* gen5 496 5149 10051
* gen6 413 4239 8190
*
* Values are in MiB/s of data processed by a single thread, not counting
* generated parity.
*
* You can replicate these results in your machine using the
* "raid/test/speedtest.c" program.
*
* For comparison, the triple parity computation using the power
 * coefficients "1,2,2^-1" is only a little faster than the one based on
* the Cauchy matrix if SSSE3 or AVX2 is present.
*
* int8 int32 int64 sse2 ssse3 avx2
* genz 2337 2874 10920 18944
*
* In conclusion, the use of power coefficients, and specifically powers
* of 1,2,2^-1, is the best option to implement triple parity in CPUs
* without SSSE3 and AVX2.
* But if a modern CPU with SSSE3 or AVX2 is available, the Cauchy
* matrix is the best option because it provides a fast and general
* approach working for any number of parities.
*
* References:
* [1] Anvin, "The mathematics of RAID-6", 2004
* [2] MacWilliams, Sloane, "The Theory of Error-Correcting Codes", 1977
* [3] Blomer, "An XOR-Based Erasure-Resilient Coding Scheme", 1995
* [4] Roth, "Introduction to Coding Theory", 2006
* [5] Plank, "Screaming Fast Galois Field Arithmetic Using Intel SIMD Instructions", 2013
*/
/**
* Generator matrix currently used.
*/
const uint8_t (*raid_gfgen)[256];
/**
 * Selects the generator matrix and the matching triple parity
 * implementation for the requested mode.
 */
void raid_mode(int mode)
{
	switch (mode) {
	case RAID_MODE_VANDERMONDE:
		raid_gen_ptr[2] = raid_genz_ptr;
		raid_gfgen = gfvandermonde;
		break;
	default:
		raid_gen_ptr[2] = raid_gen3_ptr;
		raid_gfgen = gfcauchy;
		break;
	}
}
/**
* Buffer filled with 0 used in recovering.
*/
static void *raid_zero_block;
void raid_zero(void *zero)
{
	/* just remember the caller supplied buffer; it is only read, */
	/* never written, when used in place of missing data blocks */
	raid_zero_block = zero;
}
/*
* Forwarders for parity computation.
*
* These functions compute the parity blocks from the provided data.
*
* The number of parities to compute is implicit in the position in the
* forwarder vector. Position at index #i, computes (#i+1) parities.
*
* All these functions give the guarantee that parities are written
* in order. First parity P, then parity Q, and so on.
* This allows to specify the same memory buffer for multiple parities
 * knowing that you'll get the latest written one.
* This characteristic is used by the raid_delta_gen() function to
* avoid to damage unused parities in recovering.
*
* @nd Number of data blocks
 * @size Size of the blocks pointed by @v. It must be a multiple of 64.
* @v Vector of pointers to the blocks of data and parity.
* It has (@nd + #parities) elements. The starting elements are the blocks
* for data, following with the parity blocks.
* Each block has @size bytes.
*/
void (*raid_gen_ptr[RAID_PARITY_MAX])(int nd, size_t size, void **vv);
void (*raid_gen3_ptr)(int nd, size_t size, void **vv);
void (*raid_genz_ptr)(int nd, size_t size, void **vv);
/**
 * Computes @np parity blocks of the @nd data blocks in @v,
 * dispatching to the implementation selected by raid_init().
 */
void raid_gen(int nd, int np, size_t size, void **v)
{
	/* the parity functions process the blocks in 64 byte units */
	BUG_ON(size % 64 != 0);

	/* one forwarder exists for each supported parity level */
	BUG_ON(np < 1);
	BUG_ON(np > RAID_PARITY_MAX);

	raid_gen_ptr[np - 1](nd, size, v);
}
/**
* Inverts the square matrix M of size nxn into V.
*
* This is not a general matrix inversion because we assume the matrix M
* to have all the square submatrix not singular.
* We use Gauss elimination to invert.
*
* @M Matrix to invert with @n rows and @n columns.
* @V Destination matrix where the result is put.
* @n Number of rows and columns of the matrix.
*/
void raid_invert(uint8_t *M, uint8_t *V, int n)
{
	int row, col, k;

	/* start with V as the identity matrix */
	for (row = 0; row < n; ++row)
		for (col = 0; col < n; ++col)
			V[row * n + col] = (row == col);

	/* Gauss elimination, one pivot at a time */
	for (k = 0; k < n; ++k) {
		uint8_t f;

		/* the pivot cannot be 0 because we only invert matrices */
		/* with all the square submatrices not singular */
		BUG_ON(M[k * n + k] == 0);

		/* scale the pivot row to make the pivot 1 */
		f = inv(M[k * n + k]);
		for (col = 0; col < n; ++col) {
			M[k * n + col] = mul(f, M[k * n + col]);
			V[k * n + col] = mul(f, V[k * n + col]);
		}

		/* zero the rest of the pivot column, both over */
		/* and under the diagonal */
		for (row = 0; row < n; ++row) {
			if (row == k)
				continue;
			f = M[row * n + k];
			for (col = 0; col < n; ++col) {
				M[row * n + col] ^= mul(f, M[k * n + col]);
				V[row * n + col] ^= mul(f, V[k * n + col]);
			}
		}
	}
}
/**
 * Computes the parity without the missing data blocks
 * and store it in the buffers of such data blocks.
 *
 * This is the parity expressed as Pa,Qa,Ra,Sa,Ta,Ua in the equations.
 *
 * @nr Number of missing data blocks.
 * @id[] Vector of @nr indexes of the missing data blocks, in order.
 * @ip[] Vector of @nr indexes of the parities to use, in order.
 * @nd Number of data blocks.
 * @size Size of the blocks.
 * @v Vector of pointers to the blocks of data and parity; it is
 * temporarily rearranged but restored before returning.
 */
void raid_delta_gen(int nr, int *id, int *ip, int nd, size_t size, void **v)
{
	void *p[RAID_PARITY_MAX];
	void *pa[RAID_PARITY_MAX];
	int i, j;
	int np;
	void *latest;
	/* total number of parities we are going to process */
	/* they are both the used and the unused ones */
	np = ip[nr - 1] + 1;
	/* latest missing data block */
	latest = v[id[nr - 1]];
	/* setup pointers for delta computation */
	for (i = 0, j = 0; i < np; ++i) {
		/* keep a copy of the original parity vector */
		p[i] = v[nd + i];
		/* note: ip[j] is never read out of bounds, because j can */
		/* reach nr only at the last iteration, when i == ip[nr-1] */
		if (ip[j] == i) {
			/*
			 * Set used parities to point to the missing
			 * data blocks.
			 *
			 * The related data blocks are instead set
			 * to point to the "zero" buffer.
			 */
			/* the latest parity to use ends the for loop and */
			/* then it cannot happen to process more of them */
			BUG_ON(j >= nr);
			/* buffer for missing data blocks */
			pa[j] = v[id[j]];
			/* set at zero the missing data blocks */
			v[id[j]] = raid_zero_block;
			/* compute the parity over the missing data blocks */
			v[nd + i] = pa[j];
			/* check for the next used entry */
			++j;
		} else {
			/*
			 * Unused parities are going to be rewritten with
			 * not significant data, because we don't have
			 * functions able to compute only a subset of
			 * parities.
			 *
			 * To avoid this, we reuse parity buffers,
			 * assuming that all the parity functions write
			 * parities in order.
			 *
			 * We assign the unused parity block to the same
			 * block of the latest used parity that we know it
			 * will be written.
			 *
			 * This means that this block will be written
			 * multiple times and only the latest write will
			 * contain the correct data.
			 */
			v[nd + i] = latest;
		}
	}
	/* all the parities have to be processed */
	BUG_ON(j != nr);
	/* recompute the parity, note that np may be smaller than the */
	/* total number of parities available */
	raid_gen(nd, np, size, v);
	/* restore data buffers as before */
	for (j = 0; j < nr; ++j)
		v[id[j]] = pa[j];
	/* restore parity buffers as before */
	for (i = 0; i < np; ++i)
		v[nd + i] = p[i];
}
/**
* Recover failure of one data block for PAR1.
*
* Starting from the equation:
*
* Pd = Dx
*
* and solving we get:
*
* Dx = Pd
*/
void raid_rec1of1(int *id, int nd, size_t size, void **v)
{
	void *parity = v[nd];
	void *missing = v[id[0]];

	/* for PAR1 the parity of the remaining blocks is exactly the */
	/* missing block, so we swap parity and missing block and */
	/* recompute, without needing the zero buffer */
	v[id[0]] = parity;
	v[nd] = missing;

	/* compute */
	raid_gen(nd, 1, size, v);

	/* undo the swap */
	v[id[0]] = missing;
	v[nd] = parity;
}
/**
* Recover failure of two data blocks for PAR2.
*
* Starting from the equations:
*
* Pd = Dx + Dy
* Qd = 2^id[0] * Dx + 2^id[1] * Dy
*
* and solving we get:
*
* 1 2^(-id[0])
* Dy = ------------------- * Pd + ------------------- * Qd
* 2^(id[1]-id[0]) + 1 2^(id[1]-id[0]) + 1
*
* Dx = Dy + Pd
*
* with conditions:
*
* 2^id[0] != 0
* 2^(id[1]-id[0]) + 1 != 0
*
* That are always satisfied for any 0<=id[0]<id[1]<255.
*/
void raid_rec2of2_int8(int *id, int *ip, int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t **)vv;
	const uint8_t *T[2];
	uint8_t *p;
	uint8_t *pa;
	uint8_t *q;
	uint8_t *qa;
	size_t off;

	/* get multiplication tables */
	T[0] = table(inv(pow2(id[1] - id[0]) ^ 1));
	T[1] = table(inv(pow2(id[0]) ^ pow2(id[1])));

	/* compute delta parity */
	raid_delta_gen(2, id, ip, nd, size, vv);

	p = v[nd];
	q = v[nd + 1];
	pa = v[id[0]];
	qa = v[id[1]];

	for (off = 0; off < size; ++off) {
		/* delta */
		uint8_t Pd = p[off] ^ pa[off];
		uint8_t Qd = q[off] ^ qa[off];
		/* reconstruct Dy first, then Dx from it */
		uint8_t Dy = T[0][Pd] ^ T[1][Qd];

		qa[off] = Dy;
		pa[off] = Pd ^ Dy;
	}
}
/*
* Forwarders for data recovery.
*
* These functions recover data blocks using the specified parity
* to recompute the missing data.
*
* Note that the format of vectors @id/@ip is different than raid_rec().
* For example, in the vector @ip the first parity is represented with the
* value 0 and not @nd.
*
* @nr Number of failed data blocks to recover.
* @id[] Vector of @nr indexes of the data blocks to recover.
* The indexes start from 0. They must be in order.
* @ip[] Vector of @nr indexes of the parity blocks to use in the recovering.
* The indexes start from 0. They must be in order.
* @nd Number of data blocks.
* @np Number of parity blocks.
 * @size Size of the blocks pointed by @v. It must be a multiple of 64.
* @v Vector of pointers to the blocks of data and parity.
* It has (@nd + @np) elements. The starting elements are the blocks
* for data, following with the parity blocks.
* Each block has @size bytes.
*/
void (*raid_rec_ptr[RAID_PARITY_MAX])(
int nr, int *id, int *ip, int nd, size_t size, void **vv);
void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v)
{
	int nrd; /* number of data blocks to recover */
	int nrp; /* number of parity blocks to recover */
	int i;

	/* enforce limit on size */
	BUG_ON(size % 64 != 0);

	/* enforce limit on number of failures */
	BUG_ON(nr > np);
	BUG_ON(np > RAID_PARITY_MAX);

	/* the failed indexes have to be in strict ascending order */
	for (i = 1; i < nr; ++i)
		BUG_ON(ir[i - 1] >= ir[i]);

	/* enforce limit on index vector */
	BUG_ON(nr > 0 && ir[nr - 1] >= nd + np);

	/* the leading failures are data, the trailing ones are parity */
	nrd = 0;
	while (nrd < nr && ir[nrd] < nd)
		++nrd;
	nrp = nr - nrd;

	/* enforce limit on number of failures */
	BUG_ON(nrd > nd);
	BUG_ON(nrp > np);

	/* if failed data is present */
	if (nrd != 0) {
		int ip[RAID_PARITY_MAX];
		int j, k;

		/* select the parities to use, skipping the failed ones */
		for (i = 0, j = 0, k = 0; i < np; ++i) {
			if (j < nrp && ir[nrd + j] == nd + i) {
				/* this parity is failed, skip it */
				++j;
			} else {
				/* this parity is usable for recovering */
				ip[k] = i;
				++k;
			}
		}

		/* recover the nrd failed data blocks listed in ir[], */
		/* using the first nrd usable parities in ip[] */
		raid_rec_ptr[nrd - 1](nrd, ir, ip, nd, size, v);
	}

	/* recompute all the parities up to the last bad one */
	if (nrp != 0)
		raid_gen(nd, ir[nr - 1] - nd + 1, size, v);
}
void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v)
{
	int i;

	/* enforce limit on size */
	BUG_ON(size % 64 != 0);

	/* enforce limit on number of failures */
	BUG_ON(nr > nd);
	BUG_ON(nr > RAID_PARITY_MAX);

	/* both index vectors have to be in strict ascending order */
	for (i = 1; i < nr; ++i) {
		BUG_ON(id[i - 1] >= id[i]);
		BUG_ON(ip[i - 1] >= ip[i]);
	}

	/* enforce limit on index vector for data */
	BUG_ON(nr > 0 && id[nr - 1] >= nd);

	/* if failed data is present */
	if (nr != 0)
		raid_rec_ptr[nr - 1](nr, id, ip, nd, size, v);
}

229
raid/raid.h Normal file
View File

@ -0,0 +1,229 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_H
#define __RAID_H

#include <stddef.h>	/* for size_t used by the prototypes below */

/**
 * RAID mode supporting up to 6 parities.
 *
 * It requires SSSE3 to get good performance with triple or more parities.
 *
 * This is the default mode set after calling raid_init().
 */
#define RAID_MODE_CAUCHY 0

/**
 * RAID mode supporting up to 3 parities.
 *
 * It has a fast triple parity implementation without SSSE3, but it cannot
 * go beyond triple parity.
 *
 * This is mostly intended for low end CPUs like ARM and AMD Athlon.
 */
#define RAID_MODE_VANDERMONDE 1

/**
 * Maximum number of parity disks supported.
 */
#define RAID_PARITY_MAX 6

/**
 * Maximum number of data disks supported.
 */
#define RAID_DATA_MAX 251

/**
 * Initializes the RAID system.
 *
 * You must call this function before any other.
 *
 * The RAID system is initialized in the RAID_MODE_CAUCHY mode.
 */
void raid_init(void);

/**
 * Runs a basic functionality self test.
 *
 * The test is immediate, and it's intended to be run at application
 * startup to check the integrity of the RAID system.
 *
 * It returns 0 on success.
 */
int raid_selftest(void);

/**
 * Sets the mode to use. One of RAID_MODE_*.
 *
 * You can change mode at any time, and it will affect next calls to raid_gen(),
 * raid_rec() and raid_data().
 *
 * The two modes are compatible for the first two levels of parity.
 * The third one is different.
 */
void raid_mode(int mode);

/**
 * Sets the zero buffer to use in recovering.
 *
 * Before calling raid_rec() and raid_data() you must provide a memory
 * buffer filled with zero with the same size of the blocks to recover.
 *
 * This buffer is only read and never written.
 */
void raid_zero(void *zero);

/**
 * Computes parity blocks.
 *
 * This function computes the specified number of parity blocks of the
 * provided set of data blocks.
 *
 * Each parity block allows to recover one data block.
 *
 * @nd Number of data blocks.
 * @np Number of parity blocks to compute.
 * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
 * @v Vector of pointers to the blocks of data and parity.
 *   It has (@nd + @np) elements. The starting elements are the blocks for
 *   data, following with the parity blocks.
 *   Data blocks are only read and not modified. Parity blocks are written.
 *   Each block has @size bytes.
 */
void raid_gen(int nd, int np, size_t size, void **v);

/**
 * Recovers failures in data and parity blocks.
 *
 * This function recovers all the data and parity blocks marked as bad
 * in the @ir vector.
 *
 * Ensure to have @nr <= @np, otherwise recovering is not possible.
 *
 * The parity blocks used for recovering are automatically selected from
 * the ones NOT present in the @ir vector.
 *
 * In case there are more parity blocks than needed, the parities at lower
 * indexes are used in the recovering, and the others are ignored.
 *
 * Note that no internal integrity check is done when recovering. If the
 * provided parities are correct, the resulting data will be correct.
 * If parities are wrong, the resulting recovered data will be wrong.
 * This happens even in the case you have more parity blocks than needed,
 * and some form of integrity verification would be possible.
 *
 * @nr Number of failed data and parity blocks to recover.
 * @ir[] Vector of @nr indexes of the failed data and parity blocks.
 *   The indexes start from 0. They must be in order.
 *   The first parity is represented with value @nd, the second with value
 *   @nd + 1, just like positions in the @v vector.
 * @nd Number of data blocks.
 * @np Number of parity blocks.
 * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
 * @v Vector of pointers to the blocks of data and parity.
 *   It has (@nd + @np) elements. The starting elements are the blocks
 *   for data, following with the parity blocks.
 *   Each block has @size bytes.
 */
void raid_rec(int nr, int *ir, int nd, int np, size_t size, void **v);

/**
 * Recovers failures in data blocks only.
 *
 * This function recovers all the data blocks marked as bad in the @id vector.
 * The parity blocks are not modified.
 *
 * @nr Number of failed data blocks to recover.
 * @id[] Vector of @nr indexes of the data blocks to recover.
 *   The indexes start from 0. They must be in order.
 * @ip[] Vector of @nr indexes of the parity blocks to use for recovering.
 *   The indexes start from 0. They must be in order.
 * @nd Number of data blocks.
 * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
 * @v Vector of pointers to the blocks of data and parity.
 *   It has (@nd + @ip[@nr - 1] + 1) elements. The starting elements are the
 *   blocks for data, following with the parity blocks.
 *   Each block has @size bytes.
 */
void raid_data(int nr, int *id, int *ip, int nd, size_t size, void **v);

/**
 * Check the provided failed blocks combination.
 *
 * This function checks if the specified failed blocks combination satisfies
 * the redundancy information. A combination is assumed matching, if the
 * remaining valid parity is matching the expected value after recovering.
 *
 * The number of failed blocks @nr must be strictly less than the number of
 * parities @np, because you need one more parity to validate the recovering.
 *
 * No data or parity blocks are modified.
 *
 * @nr Number of failed data and parity blocks.
 * @ir[] Vector of @nr indexes of the failed data and parity blocks.
 *   The indexes start from 0. They must be in order.
 *   The first parity is represented with value @nd, the second with value
 *   @nd + 1, just like positions in the @v vector.
 * @nd Number of data blocks.
 * @np Number of parity blocks.
 * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
 * @v Vector of pointers to the blocks of data and parity.
 *   It has (@nd + @np) elements. The starting elements are the blocks
 *   for data, following with the parity blocks.
 *   Each block has @size bytes.
 * @return 0 if the check is satisfied. -1 otherwise.
 */
int raid_check(int nr, int *ir, int nd, int np, size_t size, void **v);

/**
 * Scan for failed blocks.
 *
 * This function identifies the failed data and parity blocks using the
 * available redundancy.
 *
 * It uses a brute force method, so the call can be expensive.
 * The expected execution time is proportional to the binomial coefficient
 * @np + @nd choose @np - 1, usually written as:
 *
 *   ( @np + @nd )
 *   (           )
 *   ( @np - 1   )
 *
 * No data or parity blocks are modified.
 *
 * The failed block indexes are returned in the @ir vector.
 * It must have space for at least @np - 1 values.
 *
 * The returned @ir vector can then be used in a raid_rec() call to recover
 * the failed data and parity blocks.
 *
 * @ir[] Vector filled with the indexes of the failed data and parity blocks.
 *   The indexes start from 0 and they are in order.
 *   The first parity is represented with value @nd, the second with value
 *   @nd + 1, just like positions in the @v vector.
 * @nd Number of data blocks.
 * @np Number of parity blocks.
 * @size Size of the blocks pointed to by @v. It must be a multiple of 64.
 * @v Vector of pointers to the blocks of data and parity.
 *   It has (@nd + @np) elements. The starting elements are the blocks
 *   for data, following with the parity blocks.
 *   Each block has @size bytes.
 * @return Number of block indexes returned in the @ir vector.
 *   0 if no error is detected.
 *   -1 if it's not possible to identify the failed disks.
 */
int raid_scan(int *ir, int nd, int np, size_t size, void **v);

#endif

14696
raid/tables.c Normal file

File diff suppressed because it is too large Load Diff

145
raid/tag.c Normal file
View File

@ -0,0 +1,145 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
/**
 * Table mapping every implementation function pointer to a short
 * human-readable tag naming the instruction set it is built for.
 *
 * Entries guarded by CONFIG_* are only present when the corresponding
 * instruction set support is compiled in. The table is terminated by
 * a { 0, 0 } sentinel entry.
 */
static struct raid_func {
	const char *name;
	void (*p)();
} RAID_FUNC[] = {
	{ "int8", raid_gen3_int8 },
	{ "int8", raid_gen4_int8 },
	{ "int8", raid_gen5_int8 },
	{ "int8", raid_gen6_int8 },
	{ "int32", raid_gen1_int32 },
	{ "int64", raid_gen1_int64 },
	{ "int32", raid_gen2_int32 },
	{ "int64", raid_gen2_int64 },
	{ "int32", raid_genz_int32 },
	{ "int64", raid_genz_int64 },
	{ "int8", raid_rec1_int8 },
	{ "int8", raid_rec2_int8 },
	{ "int8", raid_recX_int8 },
#ifdef CONFIG_X86
#ifdef CONFIG_SSE2
	{ "sse2", raid_gen1_sse2 },
	{ "sse2", raid_gen2_sse2 },
	{ "sse2", raid_genz_sse2 },
#endif
#ifdef CONFIG_SSSE3
	{ "ssse3", raid_gen3_ssse3 },
	{ "ssse3", raid_gen4_ssse3 },
	{ "ssse3", raid_gen5_ssse3 },
	{ "ssse3", raid_gen6_ssse3 },
	{ "ssse3", raid_rec1_ssse3 },
	{ "ssse3", raid_rec2_ssse3 },
	{ "ssse3", raid_recX_ssse3 },
#endif
#ifdef CONFIG_AVX2
	{ "avx2", raid_gen1_avx2 },
	{ "avx2", raid_gen2_avx2 },
	{ "avx2", raid_rec1_avx2 },
	{ "avx2", raid_rec2_avx2 },
	{ "avx2", raid_recX_avx2 },
#endif
#endif
#ifdef CONFIG_X86_64
#ifdef CONFIG_SSE2
	/* "e" suffix: extended variants needing the extra x86-64 registers */
	{ "sse2e", raid_gen2_sse2ext },
	{ "sse2e", raid_genz_sse2ext },
#endif
#ifdef CONFIG_SSSE3
	{ "ssse3e", raid_gen3_ssse3ext },
	{ "ssse3e", raid_gen4_ssse3ext },
	{ "ssse3e", raid_gen5_ssse3ext },
	{ "ssse3e", raid_gen6_ssse3ext },
#endif
#ifdef CONFIG_AVX2
	{ "avx2e", raid_gen3_avx2ext },
	{ "avx2e", raid_genz_avx2ext },
	{ "avx2e", raid_gen4_avx2ext },
	{ "avx2e", raid_gen5_avx2ext },
	{ "avx2e", raid_gen6_avx2ext },
#endif
#endif
	{ 0, 0 }
};
/**
 * Looks up the human-readable tag associated with an implementation
 * function pointer.
 *
 * Returns "unknown" if the function is not listed in RAID_FUNC.
 */
static const char *raid_tag(void (*func)())
{
	struct raid_func *entry;

	for (entry = RAID_FUNC; entry->name != 0; ++entry) {
		if (entry->p == func)
			return entry->name;
	}

	/* LCOV_EXCL_START */
	return "unknown";
	/* LCOV_EXCL_STOP */
}
/*
 * Accessors returning the tag of the implementation currently selected
 * for each parity generation and recovering function.
 *
 * NOTE(review): raid_gen_ptr[], raid_genz_ptr and raid_rec_ptr[] are
 * defined elsewhere in the library; presumably they are filled in by
 * the initialization code according to CPU capabilities -- confirm in
 * the module implementation.
 */

/* Tag of the selected GEN1 (single parity) implementation. */
const char *raid_gen1_tag(void)
{
	return raid_tag(raid_gen_ptr[0]);
}

/* Tag of the selected GEN2 implementation. */
const char *raid_gen2_tag(void)
{
	return raid_tag(raid_gen_ptr[1]);
}

/* Tag of the selected GENz (triple parity, powers of 2^-1) implementation. */
const char *raid_genz_tag(void)
{
	return raid_tag(raid_genz_ptr);
}

/* Tag of the selected GEN3 implementation. */
const char *raid_gen3_tag(void)
{
	return raid_tag(raid_gen_ptr[2]);
}

/* Tag of the selected GEN4 implementation. */
const char *raid_gen4_tag(void)
{
	return raid_tag(raid_gen_ptr[3]);
}

/* Tag of the selected GEN5 implementation. */
const char *raid_gen5_tag(void)
{
	return raid_tag(raid_gen_ptr[4]);
}

/* Tag of the selected GEN6 implementation. */
const char *raid_gen6_tag(void)
{
	return raid_tag(raid_gen_ptr[5]);
}

/* Tag of the selected single-failure recovering implementation. */
const char *raid_rec1_tag(void)
{
	return raid_tag(raid_rec_ptr[0]);
}

/* Tag of the selected double-failure recovering implementation. */
const char *raid_rec2_tag(void)
{
	return raid_tag(raid_rec_ptr[1]);
}

/* Tag of the selected generic (X failures) recovering implementation. */
const char *raid_recX_tag(void)
{
	return raid_tag(raid_rec_ptr[2]);
}

452
raid/test.c Normal file
View File

@ -0,0 +1,452 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#include "cpu.h"
#include "combo.h"
#include "memory.h"
/**
 * Binomial coefficient of n over r, computed recursively with
 * Pascal's rule: C(n,r) = C(n-1,r-1) + C(n-1,r).
 */
static int ibc(int n, int r)
{
	return (r == 0 || n == r) ? 1 : ibc(n - 1, r - 1) + ibc(n - 1, r);
}
/**
 * Integer power n ^ r (r >= 0; returns 1 when r == 0).
 */
static int ipow(int n, int r)
{
	int result = 1;

	for (; r > 0; --r)
		result *= n;

	return result;
}
/**
 * Tests the combination and permutation generators by counting the
 * generated sets and comparing against the expected binomial
 * coefficient and integer power.
 *
 * Returns 0 on success, -1 on mismatch.
 */
int raid_test_combo(void)
{
	int nsel;
	int total;
	int set[RAID_PARITY_MAX];

	/* combinations: expect C(RAID_PARITY_MAX, nsel) sets */
	for (nsel = 1; nsel <= RAID_PARITY_MAX; ++nsel) {
		total = 0;
		combination_first(nsel, RAID_PARITY_MAX, set);
		do {
			++total;
		} while (combination_next(nsel, RAID_PARITY_MAX, set));

		if (total != ibc(RAID_PARITY_MAX, nsel)) {
			/* LCOV_EXCL_START */
			return -1;
			/* LCOV_EXCL_STOP */
		}
	}

	/* permutations: expect RAID_PARITY_MAX ^ nsel sequences */
	for (nsel = 1; nsel <= RAID_PARITY_MAX; ++nsel) {
		total = 0;
		permutation_first(nsel, RAID_PARITY_MAX, set);
		do {
			++total;
		} while (permutation_next(nsel, RAID_PARITY_MAX, set));

		if (total != ipow(RAID_PARITY_MAX, nsel)) {
			/* LCOV_EXCL_START */
			return -1;
			/* LCOV_EXCL_STOP */
		}
	}

	return 0;
}
/**
 * Tests raid_insert() by building vectors from every possible input
 * sequence and verifying that each result is sorted.
 *
 * Returns 0 on success, -1 if an unsorted result is produced.
 */
int raid_test_insert(void)
{
	int seq[RAID_PARITY_MAX];
	int r;

	for (r = 1; r <= RAID_PARITY_MAX; ++r) {
		permutation_first(r, RAID_PARITY_MAX, seq);
		do {
			int out[RAID_PARITY_MAX];
			int k;

			/* insert the sequence elements one at a time */
			for (k = 0; k < r; ++k)
				raid_insert(k, out, seq[k]);

			/* the result must be in non-decreasing order */
			for (k = 1; k < r; ++k) {
				if (out[k - 1] > out[k]) {
					/* LCOV_EXCL_START */
					return -1;
					/* LCOV_EXCL_STOP */
				}
			}
		} while (permutation_next(r, RAID_PARITY_MAX, seq));
	}

	return 0;
}
/**
 * Tests raid_sort() against every possible input sequence, verifying
 * that the output is in non-decreasing order.
 *
 * Returns 0 on success, -1 if an unsorted result is produced.
 */
int raid_test_sort(void)
{
	int seq[RAID_PARITY_MAX];
	int r;

	for (r = 1; r <= RAID_PARITY_MAX; ++r) {
		permutation_first(r, RAID_PARITY_MAX, seq);
		do {
			int work[RAID_PARITY_MAX];
			int k;

			/* sort a scratch copy of the sequence */
			for (k = 0; k < r; ++k)
				work[k] = seq[k];
			raid_sort(r, work);

			/* verify the ordering */
			for (k = 1; k < r; ++k) {
				if (work[k - 1] > work[k]) {
					/* LCOV_EXCL_START */
					return -1;
					/* LCOV_EXCL_STOP */
				}
			}
		} while (permutation_next(r, RAID_PARITY_MAX, seq));
	}

	return 0;
}
/**
 * Tests all the recovering implementations for the given mode.
 *
 * For every count of failures, every combination of failed data disks,
 * every combination of parities used for recovering, and every
 * available recovering function, the data is deleted, recovered into
 * scratch buffers and compared against the saved original.
 *
 * @mode RAID_MODE_CAUCHY (up to 6 parities) or RAID_MODE_VANDERMONDE
 *   (up to 3 parities).
 * @nd Number of data disks. Runtime grows quickly with this value.
 * @size Block size in bytes.
 * Returns 0 on success, -1 on allocation failure or recovering mismatch.
 */
int raid_test_rec(int mode, int nd, size_t size)
{
	/* candidate recovering functions, indexed by (failures - 1) */
	void (*f[RAID_PARITY_MAX][4])(
		int nr, int *id, int *ip, int nd, size_t size, void **vbuf);
	void *v_alloc;
	void **v;
	void **data;
	void **parity;
	void **test;
	void *data_save[RAID_PARITY_MAX];
	void *parity_save[RAID_PARITY_MAX];
	void *waste;
	int nv;
	int id[RAID_PARITY_MAX];
	int ip[RAID_PARITY_MAX];
	int i;
	int j;
	int nr;
	int nf[RAID_PARITY_MAX];
	int np;

	raid_mode(mode);
	if (mode == RAID_MODE_CAUCHY)
		np = RAID_PARITY_MAX;
	else
		np = 3;

	/* layout: nd data + np parity + np scratch + zero + waste buffers */
	nv = nd + np * 2 + 2;

	v = raid_malloc_vector(nd, nv, size, &v_alloc);
	if (!v) {
		/* LCOV_EXCL_START */
		return -1;
		/* LCOV_EXCL_STOP */
	}

	data = v;
	parity = v + nd;
	test = v + nd + np;

	/* remember the real parity buffers before they are swapped out */
	for (i = 0; i < np; ++i)
		parity_save[i] = parity[i];

	/* dedicate the second-to-last buffer as the shared zero block */
	memset(v[nv - 2], 0, size);
	raid_zero(v[nv - 2]);

	/* sink buffer for parities not selected for recovering */
	waste = v[nv - 1];

	/* fill with pseudo-random data with the arbitrary seed "1" */
	raid_mrand_vector(1, nd, size, v);

	/* setup recov functions */
	for (i = 0; i < np; ++i) {
		nf[i] = 0;
		if (i == 0) {
			f[i][nf[i]++] = raid_rec1_int8;
#ifdef CONFIG_X86
#ifdef CONFIG_SSSE3
			if (raid_cpu_has_ssse3())
				f[i][nf[i]++] = raid_rec1_ssse3;
#endif
#ifdef CONFIG_AVX2
			if (raid_cpu_has_avx2())
				f[i][nf[i]++] = raid_rec1_avx2;
#endif
#endif
		} else if (i == 1) {
			f[i][nf[i]++] = raid_rec2_int8;
#ifdef CONFIG_X86
#ifdef CONFIG_SSSE3
			if (raid_cpu_has_ssse3())
				f[i][nf[i]++] = raid_rec2_ssse3;
#endif
#ifdef CONFIG_AVX2
			if (raid_cpu_has_avx2())
				f[i][nf[i]++] = raid_rec2_avx2;
#endif
#endif
		} else {
			f[i][nf[i]++] = raid_recX_int8;
#ifdef CONFIG_X86
#ifdef CONFIG_SSSE3
			if (raid_cpu_has_ssse3())
				f[i][nf[i]++] = raid_recX_ssse3;
#endif
#ifdef CONFIG_AVX2
			if (raid_cpu_has_avx2())
				f[i][nf[i]++] = raid_recX_avx2;
#endif
#endif
		}
	}

	/* compute the parity */
	raid_gen_ref(nd, np, size, v);

	/* set all the parity to the waste v */
	for (i = 0; i < np; ++i)
		parity[i] = waste;

	/* all parity levels */
	for (nr = 1; nr <= np; ++nr) {
		/* all combinations (nr of nd) disks */
		combination_first(nr, nd, id);
		do {
			/* all combinations (nr of np) parities */
			combination_first(nr, np, ip);
			do {
				/* for each recover function */
				for (j = 0; j < nf[nr - 1]; ++j) {
					/* set */
					for (i = 0; i < nr; ++i) {
						/* remove the missing data */
						data_save[i] = data[id[i]];
						data[id[i]] = test[i];
						/* set the parity to use */
						parity[ip[i]] = parity_save[ip[i]];
					}

					/* recover */
					f[nr - 1][j](nr, id, ip, nd, size, v);

					/* check */
					for (i = 0; i < nr; ++i) {
						if (memcmp(test[i], data_save[i], size) != 0) {
							/* LCOV_EXCL_START */
							goto bail;
							/* LCOV_EXCL_STOP */
						}
					}

					/* restore */
					for (i = 0; i < nr; ++i) {
						/* restore the data */
						data[id[i]] = data_save[i];
						/* restore the parity */
						parity[ip[i]] = waste;
					}
				}
			} while (combination_next(nr, np, ip));
		} while (combination_next(nr, nd, id));
	}

	free(v_alloc);
	free(v);
	return 0;

bail:
	/* LCOV_EXCL_START */
	free(v_alloc);
	free(v);
	return -1;
	/* LCOV_EXCL_STOP */
}
/**
 * Tests all the parity generation implementations for the given mode.
 *
 * Parity is first computed with the reference implementation
 * (raid_gen_ref) and kept in back buffers; every available optimized
 * implementation is then run and its output compared against the
 * reference.
 *
 * @mode RAID_MODE_CAUCHY (up to 6 parities) or RAID_MODE_VANDERMONDE
 *   (up to 3 parities).
 * @nd Number of data disks.
 * @size Block size in bytes.
 * Returns 0 on success, -1 on allocation/memory failure or mismatch.
 */
int raid_test_par(int mode, int nd, size_t size)
{
	/* list of parity generation functions to exercise */
	void (*f[64])(int nd, size_t size, void **vbuf);
	void *v_alloc;
	void **v;
	int nv;
	int i, j;
	int nf;
	int np;

	raid_mode(mode);
	if (mode == RAID_MODE_CAUCHY)
		np = RAID_PARITY_MAX;
	else
		np = 3;

	/* layout: nd data + np parity + np reference back buffers */
	nv = nd + np * 2;

	v = raid_malloc_vector(nd, nv, size, &v_alloc);
	if (!v) {
		/* LCOV_EXCL_START */
		return -1;
		/* LCOV_EXCL_STOP */
	}

	/* check memory */
	if (raid_mtest_vector(nv, size, v) != 0) {
		/* LCOV_EXCL_START */
		goto bail;
		/* LCOV_EXCL_STOP */
	}

	/* fill with pseudo-random data with the arbitrary seed "2" */
	raid_mrand_vector(2, nv, size, v);

	/* compute the parity */
	raid_gen_ref(nd, np, size, v);

	/* copy in back buffers */
	for (i = 0; i < np; ++i)
		memcpy(v[nd + np + i], v[nd + i], size);

	/* load all the available functions */
	nf = 0;

	f[nf++] = raid_gen1_int32;
	f[nf++] = raid_gen1_int64;
	f[nf++] = raid_gen2_int32;
	f[nf++] = raid_gen2_int64;

#ifdef CONFIG_X86
#ifdef CONFIG_SSE2
	if (raid_cpu_has_sse2()) {
		f[nf++] = raid_gen1_sse2;
		f[nf++] = raid_gen2_sse2;
#ifdef CONFIG_X86_64
		f[nf++] = raid_gen2_sse2ext;
#endif
	}
#endif

#ifdef CONFIG_AVX2
	if (raid_cpu_has_avx2()) {
		f[nf++] = raid_gen1_avx2;
		f[nf++] = raid_gen2_avx2;
	}
#endif
#endif /* CONFIG_X86 */

	if (mode == RAID_MODE_CAUCHY) {
		/* triple parity and beyond is only available in Cauchy mode */
		f[nf++] = raid_gen3_int8;
		f[nf++] = raid_gen4_int8;
		f[nf++] = raid_gen5_int8;
		f[nf++] = raid_gen6_int8;

#ifdef CONFIG_X86
#ifdef CONFIG_SSSE3
		if (raid_cpu_has_ssse3()) {
			f[nf++] = raid_gen3_ssse3;
			f[nf++] = raid_gen4_ssse3;
			f[nf++] = raid_gen5_ssse3;
			f[nf++] = raid_gen6_ssse3;
#ifdef CONFIG_X86_64
			f[nf++] = raid_gen3_ssse3ext;
			f[nf++] = raid_gen4_ssse3ext;
			f[nf++] = raid_gen5_ssse3ext;
			f[nf++] = raid_gen6_ssse3ext;
#endif
		}
#endif

#ifdef CONFIG_AVX2
#ifdef CONFIG_X86_64
		if (raid_cpu_has_avx2()) {
			f[nf++] = raid_gen3_avx2ext;
			f[nf++] = raid_gen4_avx2ext;
			f[nf++] = raid_gen5_avx2ext;
			f[nf++] = raid_gen6_avx2ext;
		}
#endif
#endif
#endif /* CONFIG_X86 */
	} else {
		/* Vandermonde mode uses the GENz triple parity kernels */
		f[nf++] = raid_genz_int32;
		f[nf++] = raid_genz_int64;

#ifdef CONFIG_X86
#ifdef CONFIG_SSE2
		if (raid_cpu_has_sse2()) {
			f[nf++] = raid_genz_sse2;
#ifdef CONFIG_X86_64
			f[nf++] = raid_genz_sse2ext;
#endif
		}
#endif

#ifdef CONFIG_AVX2
#ifdef CONFIG_X86_64
		if (raid_cpu_has_avx2())
			f[nf++] = raid_genz_avx2ext;
#endif
#endif
#endif /* CONFIG_X86 */
	}

	/* check all the functions */
	for (j = 0; j < nf; ++j) {
		/* compute parity */
		f[j](nd, size, v);

		/* check it */
		for (i = 0; i < np; ++i) {
			if (memcmp(v[nd + np + i], v[nd + i], size) != 0) {
				/* LCOV_EXCL_START */
				goto bail;
				/* LCOV_EXCL_STOP */
			}
		}
	}

	free(v_alloc);
	free(v);
	return 0;

bail:
	/* LCOV_EXCL_START */
	free(v_alloc);
	free(v);
	return -1;
	/* LCOV_EXCL_STOP */
}

68
raid/test.h Normal file
View File

@ -0,0 +1,68 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#ifndef __RAID_TEST_H
#define __RAID_TEST_H

#include <stddef.h>	/* for size_t used by the prototypes below */

/**
 * Tests insertion function.
 *
 * Test raid_insert() with all the possible combinations of elements to insert.
 *
 * Returns 0 on success.
 */
int raid_test_insert(void);

/**
 * Tests sorting function.
 *
 * Test raid_sort() with all the possible combinations of elements to sort.
 *
 * Returns 0 on success.
 */
int raid_test_sort(void);

/**
 * Tests combination functions.
 *
 * Tests combination_first() and combination_next() for all the parity levels.
 *
 * Returns 0 on success.
 */
int raid_test_combo(void);

/**
 * Tests recovering functions.
 *
 * All the recovering functions are tested with all the combinations
 * of failing disks and recovering parities.
 *
 * Take care that the test time grows exponentially with the number of disks.
 *
 * @mode One of RAID_MODE_*.
 *
 * Returns 0 on success.
 */
/* note: @mode is "int" (not "unsigned") to match the definition in test.c */
int raid_test_rec(int mode, int nd, size_t size);

/**
 * Tests parity generation functions.
 *
 * All the parity generation functions are tested with the specified
 * number of disks.
 *
 * @mode One of RAID_MODE_*.
 *
 * Returns 0 on success.
 */
int raid_test_par(int mode, int nd, size_t size);

#endif

2452
raid/x86.c Normal file

File diff suppressed because it is too large Load Diff

255
raid/x86z.c Normal file
View File

@ -0,0 +1,255 @@
/*
* Copyright (C) 2013 Andrea Mazzoleni
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation, either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*/
#include "internal.h"
#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/**
 * Byte constants replicated across a full 16-byte SSE register, used
 * by the GENz kernels below:
 *
 * poly - value conditionally xor-ed in after the byte-wise doubling
 *        (left shift) to reduce the result in the field.
 * half - value conditionally xor-ed in after the byte-wise halving
 *        (right shift) to reduce the result in the field.
 * low7 - mask clearing the top bit of every byte after the 16-bit
 *        right shift, to keep the shift byte-wise.
 */
static const struct gfzconst16 {
	uint8_t poly[16];
	uint8_t half[16];
	uint8_t low7[16];
} gfzconst16 __aligned(64) =
{
	{
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d,
		0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d
	},
	{
		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e,
		0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e, 0x8e
	},
	{
		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f,
		0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f, 0x7f
	}
};
#endif
#if defined(CONFIG_X86) && defined(CONFIG_SSE2)
/*
 * GENz (triple parity with powers of 2^-1) SSE2 implementation
 *
 * Processes 16 bytes per iteration. Walking the data disks from the
 * last down to the first, it accumulates xmm0 (plain xor parity),
 * xmm1 (parity with the running value doubled before each xor) and
 * xmm2 (parity with the running value halved before each xor), then
 * stores them into the p, q and r parity buffers.
 */
void raid_genz_sse2(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	int d, l;
	size_t i;

	l = nd - 1;		/* index of the last data disk */
	p = v[nd];		/* first parity buffer */
	q = v[nd + 1];		/* second parity buffer */
	r = v[nd + 2];		/* third parity buffer */

	raid_sse_begin();

	/* xmm7 = poly, xmm3 = half, xmm6 = low7 (kept for the whole run) */
	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("movdqa %0,%%xmm6" : : "m" (gfzconst16.low7[0]));

	for (i = 0; i < size; i += 16) {
		/* start all three accumulators from the last disk's data */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %xmm0,%xmm1");
		asm volatile ("movdqa %xmm0,%xmm2");
		for (d = l - 1; d >= 0; --d) {
			/* xmm1 *= 2: byte-wise left shift (paddb), then
			 * xor poly into the bytes whose top bit was set */
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pcmpgtb %xmm1,%xmm4");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pxor %xmm4,%xmm1");

			/* xmm2 *= 2^-1: byte-wise right shift (psrlw +
			 * low7 mask), then xor half into the bytes whose
			 * low bit was set (detected via psllw $7) */
			asm volatile ("movdqa %xmm2,%xmm4");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("psllw $7,%xmm4");
			asm volatile ("psrlw $1,%xmm2");
			asm volatile ("pcmpgtb %xmm4,%xmm5");
			asm volatile ("pand %xmm6,%xmm2");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pxor %xmm5,%xmm2");

			/* xor the next data disk into all three parities */
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
		}
		/* non-temporal stores to avoid polluting the cache */
		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
	}

	raid_sse_end();
}
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_SSE2)
/*
 * GENz (triple parity with powers of 2^-1) SSE2 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 *
 * Same algorithm as raid_genz_sse2(), but processing two interleaved
 * 16-byte lanes per iteration (32 bytes): registers xmm0-xmm6 handle
 * the first lane and xmm8-xmm14 the second, sharing the constants in
 * xmm7 (poly), xmm3 (half) and xmm11 (low7).
 */
void raid_genz_sse2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	int d, l;
	size_t i;

	l = nd - 1;		/* index of the last data disk */
	p = v[nd];		/* first parity buffer */
	q = v[nd + 1];		/* second parity buffer */
	r = v[nd + 2];		/* third parity buffer */

	raid_sse_begin();

	asm volatile ("movdqa %0,%%xmm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("movdqa %0,%%xmm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("movdqa %0,%%xmm11" : : "m" (gfzconst16.low7[0]));

	for (i = 0; i < size; i += 32) {
		/* start the accumulators from the last disk's data */
		asm volatile ("movdqa %0,%%xmm0" : : "m" (v[l][i]));
		asm volatile ("movdqa %0,%%xmm8" : : "m" (v[l][i + 16]));
		asm volatile ("movdqa %xmm0,%xmm1");
		asm volatile ("movdqa %xmm8,%xmm9");
		asm volatile ("movdqa %xmm0,%xmm2");
		asm volatile ("movdqa %xmm8,%xmm10");
		for (d = l - 1; d >= 0; --d) {
			/* double xmm1/xmm9 and halve xmm2/xmm10, with the
			 * conditional reductions, both lanes interleaved */
			asm volatile ("movdqa %xmm2,%xmm6");
			asm volatile ("movdqa %xmm10,%xmm14");
			asm volatile ("pxor %xmm4,%xmm4");
			asm volatile ("pxor %xmm12,%xmm12");
			asm volatile ("pxor %xmm5,%xmm5");
			asm volatile ("pxor %xmm13,%xmm13");
			asm volatile ("psllw $7,%xmm6");
			asm volatile ("psllw $7,%xmm14");
			asm volatile ("psrlw $1,%xmm2");
			asm volatile ("psrlw $1,%xmm10");
			asm volatile ("pcmpgtb %xmm1,%xmm4");
			asm volatile ("pcmpgtb %xmm9,%xmm12");
			asm volatile ("pcmpgtb %xmm6,%xmm5");
			asm volatile ("pcmpgtb %xmm14,%xmm13");
			asm volatile ("paddb %xmm1,%xmm1");
			asm volatile ("paddb %xmm9,%xmm9");
			asm volatile ("pand %xmm11,%xmm2");
			asm volatile ("pand %xmm11,%xmm10");
			asm volatile ("pand %xmm7,%xmm4");
			asm volatile ("pand %xmm7,%xmm12");
			asm volatile ("pand %xmm3,%xmm5");
			asm volatile ("pand %xmm3,%xmm13");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm12,%xmm9");
			asm volatile ("pxor %xmm5,%xmm2");
			asm volatile ("pxor %xmm13,%xmm10");

			/* xor the next data disk into all parities */
			asm volatile ("movdqa %0,%%xmm4" : : "m" (v[d][i]));
			asm volatile ("movdqa %0,%%xmm12" : : "m" (v[d][i + 16]));
			asm volatile ("pxor %xmm4,%xmm0");
			asm volatile ("pxor %xmm4,%xmm1");
			asm volatile ("pxor %xmm4,%xmm2");
			asm volatile ("pxor %xmm12,%xmm8");
			asm volatile ("pxor %xmm12,%xmm9");
			asm volatile ("pxor %xmm12,%xmm10");
		}
		/* non-temporal stores to avoid polluting the cache */
		asm volatile ("movntdq %%xmm0,%0" : "=m" (p[i]));
		asm volatile ("movntdq %%xmm8,%0" : "=m" (p[i + 16]));
		asm volatile ("movntdq %%xmm1,%0" : "=m" (q[i]));
		asm volatile ("movntdq %%xmm9,%0" : "=m" (q[i + 16]));
		asm volatile ("movntdq %%xmm2,%0" : "=m" (r[i]));
		asm volatile ("movntdq %%xmm10,%0" : "=m" (r[i + 16]));
	}

	raid_sse_end();
}
#endif
#if defined(CONFIG_X86_64) && defined(CONFIG_AVX2)
/*
 * GENz (triple parity with powers of 2^-1) AVX2 implementation
 *
 * Note that it uses 16 registers, meaning that x64 is required.
 *
 * Same algorithm as the SSE2 variants, widened to two interleaved
 * 32-byte lanes per iteration (64 bytes). The constants are broadcast
 * into ymm7 (poly), ymm3 (half) and ymm11 (low7); ymm15 is kept zero
 * for the three-operand vpcmpgtb sign tests.
 */
void raid_genz_avx2ext(int nd, size_t size, void **vv)
{
	uint8_t **v = (uint8_t**)vv;
	uint8_t *p;
	uint8_t *q;
	uint8_t *r;
	int d, l;
	size_t i;

	l = nd - 1;		/* index of the last data disk */
	p = v[nd];		/* first parity buffer */
	q = v[nd + 1];		/* second parity buffer */
	r = v[nd + 2];		/* third parity buffer */

	raid_avx_begin();

	asm volatile ("vbroadcasti128 %0,%%ymm7" : : "m" (gfzconst16.poly[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm3" : : "m" (gfzconst16.half[0]));
	asm volatile ("vbroadcasti128 %0,%%ymm11" : : "m" (gfzconst16.low7[0]));
	asm volatile ("vpxor %ymm15,%ymm15,%ymm15");

	for (i = 0; i < size; i += 64) {
		/* start the accumulators from the last disk's data */
		asm volatile ("vmovdqa %0,%%ymm0" : : "m" (v[l][i]));
		asm volatile ("vmovdqa %0,%%ymm8" : : "m" (v[l][i + 32]));
		asm volatile ("vmovdqa %ymm0,%ymm1");
		asm volatile ("vmovdqa %ymm8,%ymm9");
		asm volatile ("vmovdqa %ymm0,%ymm2");
		asm volatile ("vmovdqa %ymm8,%ymm10");
		for (d = l - 1; d >= 0; --d) {
			/* double ymm1/ymm9 and halve ymm2/ymm10, with the
			 * conditional reductions, both lanes interleaved */
			asm volatile ("vpsllw $7,%ymm2,%ymm6");
			asm volatile ("vpsllw $7,%ymm10,%ymm14");
			asm volatile ("vpsrlw $1,%ymm2,%ymm2");
			asm volatile ("vpsrlw $1,%ymm10,%ymm10");
			asm volatile ("vpcmpgtb %ymm1,%ymm15,%ymm4");
			asm volatile ("vpcmpgtb %ymm9,%ymm15,%ymm12");
			asm volatile ("vpcmpgtb %ymm6,%ymm15,%ymm5");
			asm volatile ("vpcmpgtb %ymm14,%ymm15,%ymm13");
			asm volatile ("vpaddb %ymm1,%ymm1,%ymm1");
			asm volatile ("vpaddb %ymm9,%ymm9,%ymm9");
			asm volatile ("vpand %ymm11,%ymm2,%ymm2");
			asm volatile ("vpand %ymm11,%ymm10,%ymm10");
			asm volatile ("vpand %ymm7,%ymm4,%ymm4");
			asm volatile ("vpand %ymm7,%ymm12,%ymm12");
			asm volatile ("vpand %ymm3,%ymm5,%ymm5");
			asm volatile ("vpand %ymm3,%ymm13,%ymm13");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
			asm volatile ("vpxor %ymm5,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm13,%ymm10,%ymm10");

			/* xor the next data disk into all parities */
			asm volatile ("vmovdqa %0,%%ymm4" : : "m" (v[d][i]));
			asm volatile ("vmovdqa %0,%%ymm12" : : "m" (v[d][i + 32]));
			asm volatile ("vpxor %ymm4,%ymm0,%ymm0");
			asm volatile ("vpxor %ymm4,%ymm1,%ymm1");
			asm volatile ("vpxor %ymm4,%ymm2,%ymm2");
			asm volatile ("vpxor %ymm12,%ymm8,%ymm8");
			asm volatile ("vpxor %ymm12,%ymm9,%ymm9");
			asm volatile ("vpxor %ymm12,%ymm10,%ymm10");
		}
		/* non-temporal stores to avoid polluting the cache */
		asm volatile ("vmovntdq %%ymm0,%0" : "=m" (p[i]));
		asm volatile ("vmovntdq %%ymm8,%0" : "=m" (p[i + 32]));
		asm volatile ("vmovntdq %%ymm1,%0" : "=m" (q[i]));
		asm volatile ("vmovntdq %%ymm9,%0" : "=m" (q[i + 32]));
		asm volatile ("vmovntdq %%ymm2,%0" : "=m" (r[i]));
		asm volatile ("vmovntdq %%ymm10,%0" : "=m" (r[i + 32]));
	}

	raid_avx_end();
}
#endif