\documentclass[10pt]{article}
%\title{}
%\author{}
%\date{}
\usepackage{algorithm}
\usepackage{algorithmic}
\usepackage{a4}
\usepackage{color}
\renewcommand\floatpagefraction{0.99}
\renewcommand\topfraction{0.99}
\renewcommand\bottomfraction{0.99}
\renewcommand\textfraction{.05}
\setcounter{totalnumber}{5}
%\setlength{\parindent}{0pt}
%\pdfpagewidth 8.5in
%\pdfpageheight 11in
\setlength\topmargin{0in}
\setlength\headheight{0in}
\setlength\headsep{0in}
\setlength\textheight{9.0in}
\setlength\textwidth{6.5in}
\setlength\oddsidemargin{0in}
\setlength\evensidemargin{0in}

\newtheorem{example}{Example}[section]
\newcommand{\from}[2]{{\bf[{\sc from #1:} #2]}}
\begin{document}

\section{Annotation constraints}
How to annotate the above dataset using ``key yes'', ``distinct yes''
and ``identifying yes''.

\begin{itemize}
\item The constraint ``key yes'' is applied to a measurement type in the
  annotation. If a measurement type of an observation type is denoted
  as {\em key yes}, it means the measurement value of one observation instance is the key
  for this observation's entity instance. If two observation instances have the same
  measurement value, they are two different observation
  instances of the {\em same entity instance}.
  The measurement types of one observation type which are specified
  with ``key yes'' are also called the {\bf key measurement}(s) of this
  observation type.
\item The constraint ``distinct yes'' is applied to an observation
  type in the annotation. It needs to be used together with ``key
  yes'' to capture the semantics of {\em same observation instance}.
  I.e., if an observation type is specified with ``distinct yes'' but
  it does not have any key measurement types, the annotation is
  invalid. Besides this, we also have the following {\bf Rule 1}
  between the ``key yes'' and ``distinct yes'' constraints: when an observation type is denoted with
  ``distinct yes'', all of its measurement types should be specified
  with ``key yes''. Otherwise, there would be an inconsistency. We show
  this using an example.
\item The constraint ``identifying yes'' is applied to a context type
  description in the annotation. It is used to indicate that the
  uniqueness of one entity instance is decided not only by its own
  key measurements, but also by its context's key measurements.
  Based on its semantic meaning, the following {\bf Rule 2} is applied.
\end{itemize}

\begin{itemize}
\item {\bf Rule 1}: If an observation type is specified with {\em distinct yes}, all
  its measurement types are automatically marked with {\em key yes}. 
\item {\bf Rule 2}: If the context of an observation type is specified with {\em identifying
    yes}, this observation type MUST have some key measurement
  types, and the context observation type MUST also have key
  measurement types.
\end{itemize} 


Given the data in Table~\ref{tb:pltarea}(b) and the annotation in
Figure~\ref{fig:annot1}, the following example shows why {\bf Rule 1} is needed.


According to the annotation, if two plots have the same value for
``PlotName'', they represent the same plot observation. 
Obviously, there is a problem in interpreting the data in
Table~\ref{tb:pltarea} with this annotation.
E.g., the first and second rows capture information about the plot with
EntityName $A$. According to the annotation, they should be the same
plot observation.
However, the data shows that this one plot has two different areas
$1.0$ and $1.1$, so, there is confusion here.

For this case, there are several ways to make the annotation
consistent.
\begin{itemize}
\item The first way is: when an observation type is denoted by
  ``distinct yes'', all its measurement types should be denoted with
  ``key yes'' automatically. 
\item The second way is: get rid of the constraint ``distinct
  yes''. This way, when all the measurements are denoted with ``key
  yes'', implicitly, the same entity instance only corresponds to the
  same observation instance.  \from{HP}{I PREFER THIS ONE. }
\end{itemize}
For the above example, the right way to annotate the data is to get
rid of the {\em distinct yes}, so that the same PlotName can point to
the same plot entity, but the different areas mean that the two
different observations get two different values.

\begin{table}[htb]
\begin{center}
\begin{tabular}{cc}
\begin{tabular}{|l|l|l|}
\hline
plt & spp & dbh\\\hline
A & piru & 35.8 \\\hline
A & piru & 36.2 \\\hline
B & piru &33.2 \\\hline
B&abba&34\\\hline
\end{tabular}
&
\begin{tabular}{|l|l|l|l|}
\hline
plt & area & spp & dbh\\\hline
A & 1.0 & piru & 35.8 \\\hline
A & 1.1 & piru & 36.2 \\\hline
B & 2.0 &piru &33.2 \\\hline
B& 2.0 & abba&34\\\hline
\end{tabular}\\
(a) & (b)
\end{tabular}
\end{center}
\vspace{-0.2in}
\caption{Dataset}
\label{tb:pltarea}
\end{table}

\begin{figure}[htb]
\begin{tabular}{ll}
\begin{minipage}{0.5\linewidth} 
{\bf observation} ``o1''  \textcolor{blue}{distinct yes}\\
\verb|    |{\bf  entity} ``Plot''\\
\verb|    |{\bf measurement} ``m1'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``PlotName'' \\
\verb|        | {\bf standard} ``Nominal''\\
{\bf observation} ``o2'' \\
\verb|    |{\bf  entity} ``Tree''\\
\verb|    |{\bf measurement} ``m3'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``TreeName'' \\
\verb|        | {\bf standard} ``TaxonomicName''\\
\verb|    | {\bf measurement} ``m4''\\
\verb|        | {\bf characteristic} ``DBH'' \\
\verb|        | {\bf standard} ``Centimeter''\\
\verb|    |{\bf context identifying yes} ``o1''\\
{\bf map} ``plt'' to ``m1''\\
{\bf map} ``spp'' to ``m3''\\
{\bf map} ``dbh'' to ``m4''\\
\end{minipage}
&
\begin{minipage}{0.5\linewidth} 
\noindent
{\bf observation} ``o1''\\
\verb|    |{\bf  entity} ``Plot''\\
\verb|    |{\bf measurement} ``m1'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``PlotName'' \\
\verb|        | {\bf standard} ``Nominal''\\
\verb|    | {\bf measurement} ``m2''\\
\verb|        | {\bf characteristic} ``area'' \\
\verb|        | {\bf standard} ``sqft''\\
{\bf observation} ``o2'' \\
\verb|    |{\bf  entity} ``Tree''\\
\verb|    |{\bf measurement} ``m3'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``TreeName'' \\
\verb|        | {\bf standard} ``TaxonomicName''\\
\verb|    | {\bf measurement} ``m4''\\
\verb|        | {\bf characteristic} ``DBH'' \\
\verb|        | {\bf standard} ``Centimeter''\\
\verb|    |{\bf context identifying yes} ``o1''\\
{\bf map} ``plt'' to ``m1''\\
{\bf map} ``area'' to ``m2''\\
{\bf map} ``spp'' to ``m3''\\
{\bf map} ``dbh'' to ``m4''\\
\end{minipage}
\\
(a)&(b)\\
\end{tabular}
\caption{Annotation of Table  \ref{tb:pltarea}}
\label{fig:annot1}
\end{figure}



\section{A more complex use case}


\noindent  Given the dataset shown in the table at the end of this section, users have different situations to capture.
\begin{itemize}
\item Requirement 1: {\em Plot} with label ``1'' should refer to the same physical plot (i.e., the Plot in the first 4 rows means the same thing); similarly, 
  {\em plot} with label ``2'' should refer to the second physical plot (i.e., the Plot in the last 4 rows means the same thing). 
\begin{itemize} 
\item This can be captured in annotation by putting ``Distinct yes'' for observation type {\em Plot} and ``key yes'' for its measurement type {\em PlotLabel}. 
\end{itemize}
\item Requirement 2: {\em sub-plot}s with the same label should refer to the same physical sub-plot if they are within the same plot; 
but sub-plots with the same label and different {\em Plot} labels are different sub-plots.
E.g., Row 1 \{Plot=1, sub-plot=A\} refers to the same sub-plot as that in Row 2, but is different from the one in Row 5 \{Plot=2, sub-plot=A\}. 
\begin{itemize} 
\item This can be captured by putting ``Distinct yes'' for observation type {\em SubPlot} and ``key yes'' for its measurement type {\em SubPlotLabel}. We also need to denote that {\em Plot} is its context, with {\em identifying yes} specified on this context. 
\end{itemize}
\item Requirement 3: {\em Tmnt} with the same label should refer to the same treatment process (so that we can aggregate on different treatment processes, e.g., on ``X'' or on ``C''). But the treatments at different sub-plots should refer to different treatments. 
 \begin{itemize} 
\item The first requirement can  be captured by treating all the
  Treatment with value ``X'' as different entity instances of
  the same type (TmntType). The second requirement can be captured by
  treating the treatments in different sub-plots as observations of
  different entity instances.  I.e., the treatments in row 1 and row 3
  are of the same type but are different observation instances which are of
  different entity instances. 
\item {\em Tmnt} has the context {\em sub-plot} identified with
  ``identifying yes''. 
%We don't need to specify ``key yes'', ``distinct yes'', or ``identifying yes''. 
%At the first glance, to represent this, {\em TmntType} for {\em Tmnt} should be specified with ``key yes''. 
%It should have context {\em sub-plot} which is specified with ``identifying yes''. 
%\item After further analysis, {\bf  one question arises: 
%The key measurements for the treatment observation is different from the key measurement  of the entity treatment}. 
%After considering the context, the key measurements for the treatment observation are \{Plotlabel, SubPlotLabel, TmntType\}.  
%When two rows have the same value on these measurements, they represent the same observation instance.
%However, the key measurement for the treatment entity is just \{TmntType\}. When two rows have the same value on it, they represent %the same entity instance. 
%The {\em identifying} constraint can only capture the observation context. 
%{\bf This problem is more obvious when we analyze Case 5}. 
%\item Another different annotation may be applied to catch this semantic. E.g., treat the {\em treatments} in different rows as different entity instances. 
%This way, the observation type and the entity type have the same key measurement types \{Plotlabel, SubPlotLabel, TmntType\}. 
%However, this problem still exists for Case 4 and Case 5. 

Summarization questions: \\
Give me the average weight of the individuals with treatment ``X''. 
How can this question be answered after the annotation and materialization? 
This needs to be answered after we annotate {\em Sp}, {\em Ind}, and
{\em wt}. 
The group by should be only on {\em Tmnt}. 
This can be expressed using the OM query with a condition in the
$\mathit{avg}(\mathit{Tmnt})$ constraint, but not in the $\mathit{avg}(\mathit{distinct}\ \mathit{Tmnt})$
constraint. 
\end{itemize}

%\item Requirement 4:  {\em Sp} with the same name should refer to the same species (e.g., a bird named {\em Aus} flies from sub-plot (1,A) to (1,B).)  
%But the {\em Sp} with the same name at different sub-plot should refer to different observations of a specie.
%\begin{itemize}
%\item At the first glance, to represent this, {\em SpName} for {\em Sp} should be specified with ``key yes''. 
%It should have context {\em sub-plot} which is specified with ``identifying yes''. 
%\item {\bf The same problem as Case 3: The key measurements for the {\em Sp} observation is different from the key measurement  of the entity {\em Sp}}.
%the key measurements for the species observation are \{Plotlabel, SubPlotLabel, SpName\}. 
%However, the key measurements for the species entity is just \{SpName\}. 
%\end{itemize} 

\item Requirement 4:  {\em Ind} with the same label and the same
  species name (Sp) should refer to the same bird observation.
But individuals (with the same label and the same species name) at different sub-plots should refer to different bird observations.
%\begin{itemize}
%\item {\bf The same problem as Case 4 and Case 5: The key measurements for the {\em Ind} observation is different from the key measurement  of the entity {\em Ind}.}
%the key measurements for the species observation are \{Plotlabel, SubPlotLabel, SpName, Ind\}. 
%However, the key measurement for the species entity is just \{SpName, Ind\}.  When two rows have the same value on these two columns, they represent the same entity instance. 
%In this case, the observation context of Ind is {\em Sp} and {\em Sub-plot}. But the entity context of Ind is just {\em Sp}. 
%\end{itemize} 
\end{itemize}

The annotation can be done as follows. 

\begin{tabular}{ll}
\begin{minipage}{0.5\linewidth} 
%\begin{table}[htb]
\begin{center}
\begin{tabular}{|l|l|l|l|l|l|}
\hline
Plot & sub-plot & Tmnt & Sp & Ind &wt\\\hline
1 & A & X &Aus&1&10\\\hline
1 & A & C &Bus&1&20\\\hline
1 & B & X &Aus&3&10\\\hline
1 & B & C &Bus&4&10\\\hline
2 & A & X &Aus&1&20\\\hline
2 & A & C &Bus&4&10\\\hline
2 & B & X &Aus&5&20\\\hline
2 & B & C &Bus&4&10\\\hline
\end{tabular}
\end{center}
\vspace{-0.2in}
%\caption{A dataset with more complex information}
%\label{tb:complexdb}
%\end{table}
\end{minipage}
&
\begin{minipage}{0.45\linewidth} 
%\begin{figure}[htb]
\noindent
{\bf observation} ``o1''\\
\verb|    |{\bf  entity} ``Plot''\\
\verb|    |{\bf measurement} ``m1'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``PlotLabel'' \\
\verb|        | {\bf standard} ``Nominal''\\
{\bf observation} ``o2''\\
\verb|    |{\bf  entity} ``SubPlot''\\
\verb|    |{\bf measurement} ``m2'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``SubPlotLabel'' \\
\verb|        | {\bf standard} ``Nominal''\\
\verb|    | {\bf context identifying yes} observation ``o1''\\
{\bf observation} ``o3''\\
\verb|    |{\bf  entity} ``Treatment''\\
\verb|    |{\bf measurement} ``m3'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``Procedure'' \\
\verb|        | {\bf standard} ``Nominal''\\
\verb|    | {\bf context identifying yes} observation ``o2''\\
{\bf observation} ``o4''\\
\verb|    |{\bf  entity} ``Bird''\\
\verb|    |{\bf measurement} ``m4'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``Species'' \\
\verb|        | {\bf standard} ``TaxonomicBirdName''\\
\verb|    |{\bf measurement} ``m5'' \textcolor{blue}{key yes}\\
\verb|        | {\bf characteristic} ``Individual'' \\
\verb|        | {\bf standard} ``Nominal''\\
\verb|    |{\bf measurement} ``m6'' \\
\verb|        | {\bf characteristic} ``weight'' \\
\verb|        | {\bf standard} ``kg''\\
\verb|    | {\bf context identifying yes} observation ``o2''\\
{\bf map} ``Plot'' to ``m1''\\
{\bf map} ``Sub-plot'' to ``m2''\\
{\bf map} ``Tmnt'' to ``m3''\\
{\bf map} ``Sp'' to ``m4''\\
{\bf map} ``Ind'' to ``m5''\\
{\bf map} ``wt'' to ``m6''\\
%\caption{Annotation to Table \ref{tb:complexdb}}
%\label{fig:annot2}
%\vspace{-0.2in}
%\end{figure}
\end{minipage}\\
A dataset with more complex information & Annotation

\end{tabular}


% \begin{itemize}
% \item Observation type: Plot {\em distinct yes}
%   \begin{itemize}
%   \item Measurement type: PlotLabel {\em key yes}
%   \end{itemize}
% \item Observation type: SubPlot {\em distinct yes}
%   \begin{itemize}
%     \item Measurement type: SubPlotLabel {\em key yes}
%       \item Context: Plot {\em identifying yes}. This means that the
%         key measurement for SubPlot (SubPlotLabel) and the key
%         measurement for Plot (PlotLabel) together form the key of
%         SubPlot observation (and its related entity too.)
%       \end{itemize}
% \item Observation type: Treatment {\em distinct yes}
%   \begin{itemize}
%     \item EntityType: TreatmentProcedure 
%     \item Measurement type: TmntType {\em key yes}
%    \item Context: SubPlot {\em Identifying yes}
%   \end{itemize}
% \item Observation type: SpeciesIndividual
%   \begin{itemize}
%       \item Measurement type: Species, {\em key yes}
%       \item Measurement type: IndLabel (represent {\em Individual}), {\em key yes} 
%       \item Measurement type: Weight
%       \item Context: SubPlot. Note: (1) if the users want to treat the
%         individual (with the same lavel and the same species) at
%         different sub-plots as different observation instances, here,
%         {\em identifying yes} should be specified. Otherwise, (2) if the users want to treat the
%         individual (with the same lavel and the same species) at
%         different sub-plots as SAME observation instances, here, we
%         don't need to specify {\em  identifying yes}.
%    \end{itemize} 
% \end{itemize}

%In summary, we can get a better idea about the problem described in
%the above use cases can when we answer the following two simple
%questions: \\
%Q1:  Will an entity type and an observation type (which is of the
%given entity type) always have the same key measurement type(s)?
%     The above use cases give situations that the answer is no.  \\
%Q2: is {\em identifying} itself enough to distinguish the key
%measurement for observation types and for entity types? 
%    My temporary answer to this question is no. 

%A general thinking: 
%the counterpart in RDB (Relational DataBase) is a relational scheme with key attributes. 
%Here, we have two levels of objects: entity level and instance level.
%Then, for different levels of objects, we need to have different ways to specify their key measurements. 

%\newpage




\end{document} 

