Re: JOIN with GROUP BY, without PRIMARY KEY
I've tried the queries on a large table A. Here's the script.
#drop table if exists A; drop table if exists B;
create table A (id int, val char(3), key (id), key (val), key id_val (id,val));
create table B (gr int, val char(3), score int, primary key (gr, val), key (val), unique gr_val_sc (gr, val, score));
insert into B values (1, 'one', 1),(1, 'two', 2),(2, 'one', 3),(2, 'two', 4);
Then, I've run the following script 200000 times to fill table A.
insert into A select floor(rand()*4000),
case rand() < 0.5 when 1 then 'one' else 'two' end;
Here are the results:
# Option 0
reset query cache;
select A.id, B.gr, sum(B.score)
from A join B on A.val = B.val
group by A.id, B.gr;
# 8000 rows in set (1.91 sec)
# Option 1
reset query cache;
select A.id, B.gr, sum(B.score)
from A join B where B.gr = 1 and B.val = A.val
group by A.id;
#4000 rows in set (0.78 sec)
reset query cache;
select A.id, B.gr, sum(B.score)
from A join B where B.gr = 2 and B.val = A.val
group by A.id;
#4000 rows in set (0.78 sec)
#TOTAL: 1.56 sec
# Option 2
reset query cache;
select A.id, B.gr, sum(B.score)
from A force index (id) join B on A.val = B.val where B.gr = 1
group by A.id limit 200000;
#4000 rows in set (0.28 sec)
reset query cache;
select A.id, B.gr, sum(B.score)
from A force index (id) join B on A.val = B.val where B.gr = 2
group by A.id limit 200000;
#4000 rows in set (1.02 sec)
#TOTAL: 1.30 sec
Although Option 2 is slightly faster, I'm still not convinced that this is the optimal solution. It does help to condition the query on B.gr, but it's still too slow. Plus, I find very strange that Option 2 is slower than Option 1 when B.gr = 2. Can anyone explain that?
Thanks.