MYSQL - Split Data Into Multiple Rows
I have imported some data using an application that collects info from IMDB and transfers them into a MYSQL database.
It seems the fields have not been normalized and contained many values within 1 field
For example:
Table Movie
MovieID Movie_Title Written_By
1 Movie1 Person1, Person2
2 Movie2 Person3
3 Movie3 Person4, Person2, Person6
Is there way to separate the values and have them inserted into another table to something like this and without any duplicates?
Table Writers
WriterID Written_By MovieId
1 Person1 1
2 Person2 1
3 Person3 3
I did some googling and found that I am supposed to process this data using PHP. But I have no knowledge with PHP at all.
Is there anyway to convert this data using just MYSQL?
You can use a stored procedure which uses a cursor to solve this but it's not very elegant but neither is a comma separated list of writers !
Had the following code lying around from a similar question but you'd better check it thoroughly.
Hope it helps :)
mysql> select * from movies_unf;
+---------+-------------+------------------------------------------------------+
| movieID | movie_title | written_by |
+---------+-------------+------------------------------------------------------+
| 1 | movie1 | person1, person2 |
| 2 | movie2 | person3 |
| 3 | movie3 | person4, person2, person6 |
| 4 | movie4 | person4, person4, person1, person2, person1,person8, |
| 5 | movie1 | person1, person2 |
+---------+-------------+------------------------------------------------------+
5 rows in set (0.00 sec)
call normalise_movies_unf();
mysql> select * from movies;
+----------+--------+
| movie_id | title |
+----------+--------+
| 1 | movie1 |
| 2 | movie2 |
| 3 | movie3 |
| 4 | movie4 |
+----------+--------+
4 rows in set (0.00 sec)
mysql> select * from writers;
+-----------+---------+
| writer_id | name |
+-----------+---------+
| 1 | person1 |
| 2 | person2 |
| 3 | person3 |
| 4 | person4 |
| 6 | person6 |
| 12 | person8 |
+-----------+---------+
6 rows in set (0.00 sec)
mysql> select * from movie_writers;
+----------+-----------+
| movie_id | writer_id |
+----------+-----------+
| 1 | 1 |
| 1 | 2 |
| 2 | 3 |
| 3 | 2 |
| 3 | 4 |
| 3 | 6 |
| 4 | 1 |
| 4 | 2 |
| 4 | 4 |
| 4 | 12 |
+----------+-----------+
10 rows in set (0.00 sec)
Example tables
drop table if exists movies_unf;
create table movies_unf
(
movieID int unsigned not null primary key,
movie_title varchar(255) not null,
written_by varchar(1024) not null
)engine=innodb;
insert into movies_unf values
(1,'movie1','person1, person2'),
(2,'movie2','person3'),
(3,'movie3','person4, person2, person6'),
(4,'movie4','person4, person4, person1, person2, person1,person8,'), -- dodgy writers
(5,'movie1','person1, person2'); -- dodgy movie
drop table if exists movies;
create table movies
(
movie_id int unsigned not null auto_increment primary key,
title varchar(255) unique not null
)engine=innodb;
drop table if exists writers;
create table writers
(
writer_id int unsigned not null auto_increment primary key,
name varchar(255) unique not null
)engine=innodb;
drop table if exists movie_writers;
create table movie_writers
(
movie_id int unsigned not null,
writer_id int unsigned not null,
primary key (movie_id, writer_id)
)engine=innodb;
Stored procedure
drop procedure if exists normalise_movies_unf;
delimiter #
create procedure normalise_movies_unf()
begin
declare v_movieID int unsigned default 0;
declare v_movie_title varchar(255);
declare v_writers varchar(1024);
declare v_movie_id int unsigned default 0;
declare v_writer_id int unsigned default 0;
declare v_name varchar(255);
declare v_csv_done tinyint unsigned default 0;
declare v_csv_idx int unsigned default 0;
declare v_done tinyint default 0;
declare v_cursor cursor for
select distinct movieID, movie_title, written_by from movies_unf;
declare continue handler for not found set v_done = 1;
start transaction;
open v_cursor;
repeat
fetch v_cursor into v_movieID, v_movie_title, v_writers;
set v_movie_title = trim(v_movie_title);
set v_writers = replace(v_writers,' ', '');
-- insert the movie
insert ignore into movies (title) values (v_movie_title);
select movie_id into v_movie_id from movies where title = v_movie_title;
-- split the out the writers and insert
set v_csv_done = 0;
set v_csv_idx = 1;
while not v_csv_done do
set v_name = substring(v_writers, v_csv_idx,
if(locate(',', v_writers, v_csv_idx) > 0,
locate(',', v_writers, v_csv_idx) - v_csv_idx,
length(v_writers)));
set v_name = trim(v_name);
if length(v_name) > 0 then
set v_csv_idx = v_csv_idx + length(v_name) + 1;
insert ignore into writers (name) values (v_name);
select writer_id into v_writer_id from writers where name = v_name;
insert ignore into movie_writers (movie_id, writer_id) values (v_movie_id, v_writer_id);
else
set v_csv_done = 1;
end if;
end while;
until v_done end repeat;
close v_cursor;
commit;
truncate table movies_unf;
end#
delimiter ;
EDIT
Amended sproc so that it doesnt skip key values !
drop procedure if exists normalise_movies_unf;
delimiter #
create procedure normalise_movies_unf()
begin
declare v_movieID int unsigned default 0;
declare v_movie_title varchar(255);
declare v_writers varchar(1024);
declare v_movie_id int unsigned default 0;
declare v_writer_id int unsigned default 0;
declare v_name varchar(255);
declare v_csv_done tinyint unsigned default 0;
declare v_csv_idx int unsigned default 0;
declare v_done tinyint default 0;
declare v_cursor cursor for
select distinct movieID, movie_title, written_by from movies_unf;
declare continue handler for not found set v_done = 1;
start transaction;
open v_cursor;
repeat
fetch v_cursor into v_movieID, v_movie_title, v_writers;
set v_movie_title = trim(v_movie_title);
set v_writers = replace(v_writers,' ', '');
-- insert the movie
if not exists (select 1 from movies where title = v_movie_title) then
insert ignore into movies (title) values (v_movie_title);
end if;
select movie_id into v_movie_id from movies where title = v_movie_title;
-- split the out the writers and insert
set v_csv_done = 0;
set v_csv_idx = 1;
while not v_csv_done do
set v_name = substring(v_writers, v_csv_idx,
if(locate(',', v_writers, v_csv_idx) > 0,
locate(',', v_writers, v_csv_idx) - v_csv_idx,
length(v_writers)));
set v_name = trim(v_name);
if length(v_name) > 0 then
set v_csv_idx = v_csv_idx + length(v_name) + 1;
if not exists (select 1 from writers where name = v_name) then
insert ignore into writers (name) values (v_name);
end if;
select writer_id into v_writer_id from writers where name = v_name;
insert ignore into movie_writers (movie_id, writer_id) values (v_movie_id, v_writer_id);
else
set v_csv_done = 1;
end if;
end while;
until v_done end repeat;
close v_cursor;
commit;
truncate table movies_unf;
end#
delimiter ;